4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/sysmacros.h>
31 #include <sys/pathname.h>
32 #include <sys/vnode.h>
34 #include <sys/vfs_opreg.h>
35 #include <sys/mntent.h>
36 #include <sys/mount.h>
37 #include <sys/cmn_err.h>
38 #include "fs/fs_subr.h"
39 #include <sys/zfs_znode.h>
40 #include <sys/zfs_dir.h>
42 #include <sys/fs/zfs.h>
44 #include <sys/dsl_prop.h>
45 #include <sys/dsl_dataset.h>
46 #include <sys/dsl_deleg.h>
49 #include <sys/varargs.h>
50 #include <sys/policy.h>
51 #include <sys/atomic.h>
52 #include <sys/mkdev.h>
53 #include <sys/modctl.h>
54 #include <sys/refstr.h>
55 #include <sys/zfs_ioctl.h>
56 #include <sys/zfs_ctldir.h>
57 #include <sys/zfs_fuid.h>
58 #include <sys/bootconf.h>
59 #include <sys/sunddi.h>
61 #include <sys/dmu_objset.h>
62 #include <sys/spa_boot.h>
65 vfsops_t *zfs_vfsops = NULL;
66 static major_t zfs_major;
67 static minor_t zfs_minor;
68 static kmutex_t zfs_dev_mtx;
70 static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
71 static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
72 static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
73 static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
74 static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
75 static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
76 static void zfs_freevfs(vfs_t *vfsp);
78 static const fs_operation_def_t zfs_vfsops_template[] = {
79 VFSNAME_MOUNT, { .vfs_mount = zfs_mount },
80 VFSNAME_MOUNTROOT, { .vfs_mountroot = zfs_mountroot },
81 VFSNAME_UNMOUNT, { .vfs_unmount = zfs_umount },
82 VFSNAME_ROOT, { .vfs_root = zfs_root },
83 VFSNAME_STATVFS, { .vfs_statvfs = zfs_statvfs },
84 VFSNAME_SYNC, { .vfs_sync = zfs_sync },
85 VFSNAME_VGET, { .vfs_vget = zfs_vget },
86 VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs },
90 static const fs_operation_def_t zfs_vfsops_eio_template[] = {
91 VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs },
96 * We need to keep a count of active fs's.
97 * This is necessary to prevent our module
98 * from being unloaded after a umount -f
100 static uint32_t zfs_active_fs_count = 0;
102 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
103 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
104 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
105 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
108 * MO_DEFAULT is not used since the default value is determined
109 * by the equivalent property.
111 static mntopt_t mntopts[] = {
112 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
113 { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
114 { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
115 { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
118 static mntopts_t zfs_mntopts = {
119 sizeof (mntopts) / sizeof (mntopt_t),
125 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
128 * Data integrity is job one. We don't want a compromised kernel
129 * writing to the storage pool, so we never sync during panic.
135 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
136 * to sync metadata, which they would otherwise cache indefinitely.
137 * Semantically, the only requirement is that the sync be initiated.
138 * The DMU syncs out txgs frequently, so there's nothing to do.
140 if (flag & SYNC_ATTR)
145 * Sync a specific filesystem.
147 zfsvfs_t *zfsvfs = vfsp->vfs_data;
150 if (zfsvfs->z_log != NULL)
151 zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
153 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
157 * Sync all ZFS filesystems. This is what happens when you
158 * run sync(1M). Unlike other filesystems, ZFS honors the
159 * request by waiting for all pools to commit all dirty data.
168 zfs_create_unique_device(dev_t *dev)
173 ASSERT3U(zfs_minor, <=, MAXMIN32);
174 minor_t start = zfs_minor;
176 mutex_enter(&zfs_dev_mtx);
177 if (zfs_minor >= MAXMIN32) {
179 * If we're still using the real major
180 * keep out of /dev/zfs and /dev/zvol minor
181 * number space. If we're using a getudev()'ed
182 * major number, we can use all of its minors.
184 if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
185 zfs_minor = ZFS_MIN_MINOR;
191 *dev = makedevice(zfs_major, zfs_minor);
192 mutex_exit(&zfs_dev_mtx);
193 } while (vfs_devismounted(*dev) && zfs_minor != start);
194 if (zfs_minor == start) {
196 * We are using all ~262,000 minor numbers for the
197 * current major number. Create a new major number.
199 if ((new_major = getudev()) == (major_t)-1) {
201 "zfs_mount: Can't get unique major "
205 mutex_enter(&zfs_dev_mtx);
206 zfs_major = new_major;
209 mutex_exit(&zfs_dev_mtx);
213 /* CONSTANTCONDITION */
220 atime_changed_cb(void *arg, uint64_t newval)
222 zfsvfs_t *zfsvfs = arg;
224 if (newval == TRUE) {
225 zfsvfs->z_atime = TRUE;
226 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
227 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
229 zfsvfs->z_atime = FALSE;
230 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
231 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
236 xattr_changed_cb(void *arg, uint64_t newval)
238 zfsvfs_t *zfsvfs = arg;
240 if (newval == TRUE) {
241 /* XXX locking on vfs_flag? */
242 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
243 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
244 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
246 /* XXX locking on vfs_flag? */
247 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
248 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
249 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
254 blksz_changed_cb(void *arg, uint64_t newval)
256 zfsvfs_t *zfsvfs = arg;
258 if (newval < SPA_MINBLOCKSIZE ||
259 newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
260 newval = SPA_MAXBLOCKSIZE;
262 zfsvfs->z_max_blksz = newval;
263 zfsvfs->z_vfs->vfs_bsize = newval;
267 readonly_changed_cb(void *arg, uint64_t newval)
269 zfsvfs_t *zfsvfs = arg;
272 /* XXX locking on vfs_flag? */
273 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
274 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
275 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
277 /* XXX locking on vfs_flag? */
278 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
279 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
280 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
285 devices_changed_cb(void *arg, uint64_t newval)
287 zfsvfs_t *zfsvfs = arg;
289 if (newval == FALSE) {
290 zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
291 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
292 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
294 zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
295 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
296 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
301 setuid_changed_cb(void *arg, uint64_t newval)
303 zfsvfs_t *zfsvfs = arg;
305 if (newval == FALSE) {
306 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
307 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
308 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
310 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
311 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
312 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
317 exec_changed_cb(void *arg, uint64_t newval)
319 zfsvfs_t *zfsvfs = arg;
321 if (newval == FALSE) {
322 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
323 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
324 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
326 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
327 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
328 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
333 * The nbmand mount option can be changed at mount time.
334 * We can't allow it to be toggled on live file systems or incorrect
335 * behavior may be seen from cifs clients
337 * This property isn't registered via dsl_prop_register(), but this callback
338 * will be called when a file system is first mounted
341 nbmand_changed_cb(void *arg, uint64_t newval)
343 zfsvfs_t *zfsvfs = arg;
344 if (newval == FALSE) {
345 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
346 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
348 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
349 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
354 snapdir_changed_cb(void *arg, uint64_t newval)
356 zfsvfs_t *zfsvfs = arg;
358 zfsvfs->z_show_ctldir = newval;
362 vscan_changed_cb(void *arg, uint64_t newval)
364 zfsvfs_t *zfsvfs = arg;
366 zfsvfs->z_vscan = newval;
370 acl_mode_changed_cb(void *arg, uint64_t newval)
372 zfsvfs_t *zfsvfs = arg;
374 zfsvfs->z_acl_mode = newval;
378 acl_inherit_changed_cb(void *arg, uint64_t newval)
380 zfsvfs_t *zfsvfs = arg;
382 zfsvfs->z_acl_inherit = newval;
386 zfs_register_callbacks(vfs_t *vfsp)
388 struct dsl_dataset *ds = NULL;
390 zfsvfs_t *zfsvfs = NULL;
392 int readonly, do_readonly = B_FALSE;
393 int setuid, do_setuid = B_FALSE;
394 int exec, do_exec = B_FALSE;
395 int devices, do_devices = B_FALSE;
396 int xattr, do_xattr = B_FALSE;
397 int atime, do_atime = B_FALSE;
401 zfsvfs = vfsp->vfs_data;
406 * The act of registering our callbacks will destroy any mount
407 * options we may have. In order to enable temporary overrides
408 * of mount options, we stash away the current values and
409 * restore them after we register the callbacks.
411 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
413 do_readonly = B_TRUE;
414 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
416 do_readonly = B_TRUE;
418 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
424 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
427 } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
432 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
435 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
440 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
443 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
447 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
450 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
454 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
457 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
463 * nbmand is a special property. It can only be changed at
466 * This is weird, but it is documented to only be changeable
469 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
471 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
474 char osname[MAXNAMELEN];
476 dmu_objset_name(os, osname);
477 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
484 * Register property callbacks.
486 * It would probably be fine to just check for i/o error from
487 * the first prop_register(), but I guess I like to go
490 ds = dmu_objset_ds(os);
491 error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
492 error = error ? error : dsl_prop_register(ds,
493 "xattr", xattr_changed_cb, zfsvfs);
494 error = error ? error : dsl_prop_register(ds,
495 "recordsize", blksz_changed_cb, zfsvfs);
496 error = error ? error : dsl_prop_register(ds,
497 "readonly", readonly_changed_cb, zfsvfs);
498 error = error ? error : dsl_prop_register(ds,
499 "devices", devices_changed_cb, zfsvfs);
500 error = error ? error : dsl_prop_register(ds,
501 "setuid", setuid_changed_cb, zfsvfs);
502 error = error ? error : dsl_prop_register(ds,
503 "exec", exec_changed_cb, zfsvfs);
504 error = error ? error : dsl_prop_register(ds,
505 "snapdir", snapdir_changed_cb, zfsvfs);
506 error = error ? error : dsl_prop_register(ds,
507 "aclmode", acl_mode_changed_cb, zfsvfs);
508 error = error ? error : dsl_prop_register(ds,
509 "aclinherit", acl_inherit_changed_cb, zfsvfs);
510 error = error ? error : dsl_prop_register(ds,
511 "vscan", vscan_changed_cb, zfsvfs);
516 * Invoke our callbacks to restore temporary mount options.
519 readonly_changed_cb(zfsvfs, readonly);
521 setuid_changed_cb(zfsvfs, setuid);
523 exec_changed_cb(zfsvfs, exec);
525 devices_changed_cb(zfsvfs, devices);
527 xattr_changed_cb(zfsvfs, xattr);
529 atime_changed_cb(zfsvfs, atime);
531 nbmand_changed_cb(zfsvfs, nbmand);
537 * We may attempt to unregister some callbacks that are not
538 * registered, but this is OK; it will simply return ENOMSG,
539 * which we will ignore.
541 (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
542 (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
543 (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
544 (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
545 (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
546 (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
547 (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
548 (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
549 (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
550 (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
552 (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
558 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
562 error = zfs_register_callbacks(zfsvfs->z_vfs);
567 * Set the objset user_ptr to track its zfsvfs.
569 mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
570 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
571 mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
574 * If we are not mounting (ie: online recv), then we don't
575 * have to worry about replaying the log as we blocked all
576 * operations out since we closed the ZIL.
582 * During replay we remove the read only flag to
583 * allow replays to succeed.
585 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
586 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
589 * Parse and replay the intent log.
591 zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
592 zfs_replay_vector, zfs_unlinked_drain);
594 zfs_unlinked_drain(zfsvfs);
595 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
599 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
605 zfs_freezfsvfs(zfsvfs_t *zfsvfs)
607 mutex_destroy(&zfsvfs->z_znodes_lock);
608 mutex_destroy(&zfsvfs->z_online_recv_lock);
609 list_destroy(&zfsvfs->z_all_znodes);
610 rrw_destroy(&zfsvfs->z_teardown_lock);
611 rw_destroy(&zfsvfs->z_teardown_inactive_lock);
612 rw_destroy(&zfsvfs->z_fuid_lock);
613 kmem_free(zfsvfs, sizeof (zfsvfs_t));
617 zfs_domount(vfs_t *vfsp, char *osname)
620 uint64_t recordsize, readonly;
630 * Initialize the zfs-specific filesystem structure.
631 * Should probably make this a kmem cache, shuffle fields,
632 * and just bzero up to z_hold_mtx[].
634 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
635 zfsvfs->z_vfs = vfsp;
636 zfsvfs->z_parent = zfsvfs;
637 zfsvfs->z_assign = TXG_NOWAIT;
638 zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
639 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
641 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
642 mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
643 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
644 offsetof(znode_t, z_link_node));
645 rrw_init(&zfsvfs->z_teardown_lock);
646 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
647 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
649 /* Initialize the generic filesystem structure. */
650 vfsp->vfs_bcount = 0;
651 vfsp->vfs_data = NULL;
653 if (zfs_create_unique_device(&mount_dev) == -1) {
657 ASSERT(vfs_devismounted(mount_dev) == 0);
659 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
663 vfsp->vfs_dev = mount_dev;
664 vfsp->vfs_fstype = zfsfstype;
665 vfsp->vfs_bsize = recordsize;
666 vfsp->vfs_flag |= VFS_NOTRUNC;
667 vfsp->vfs_data = zfsvfs;
669 if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
672 mode = DS_MODE_OWNER;
674 mode |= DS_MODE_READONLY;
676 error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
677 if (error == EROFS) {
678 mode = DS_MODE_OWNER | DS_MODE_READONLY;
679 error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
686 if (error = zfs_init_fs(zfsvfs, &zp))
689 /* The call to zfs_init_fs leaves the vnode held, release it here. */
693 * Set features for file system.
695 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
696 if (zfsvfs->z_use_fuids) {
697 vfs_set_feature(vfsp, VFSFT_XVATTR);
698 vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
699 vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
700 vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
702 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
703 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
704 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
705 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
706 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
707 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
708 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
711 if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
714 ASSERT(mode & DS_MODE_READONLY);
715 atime_changed_cb(zfsvfs, B_FALSE);
716 readonly_changed_cb(zfsvfs, B_TRUE);
717 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
719 xattr_changed_cb(zfsvfs, pval);
720 zfsvfs->z_issnap = B_TRUE;
722 error = zfsvfs_setup(zfsvfs, B_TRUE);
725 if (!zfsvfs->z_issnap)
726 zfsctl_create(zfsvfs);
730 dmu_objset_close(zfsvfs->z_os);
731 zfs_freezfsvfs(zfsvfs);
733 atomic_add_32(&zfs_active_fs_count, 1);
740 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
742 objset_t *os = zfsvfs->z_os;
743 struct dsl_dataset *ds;
746 * Unregister properties.
748 if (!dmu_objset_is_snapshot(os)) {
749 ds = dmu_objset_ds(os);
750 VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
753 VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
756 VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
759 VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
762 VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
765 VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
768 VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
771 VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
774 VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
777 VERIFY(dsl_prop_unregister(ds, "aclinherit",
778 acl_inherit_changed_cb, zfsvfs) == 0);
780 VERIFY(dsl_prop_unregister(ds, "vscan",
781 vscan_changed_cb, zfsvfs) == 0);
786 * Convert a decimal digit string to a uint64_t integer.
789 str_to_uint64(char *str, uint64_t *objnum)
794 if (*str < '0' || *str > '9')
797 num = num*10 + *str++ - '0';
805 * The boot path passed from the boot loader is in the form of
806 * "rootpool-name/root-filesystem-object-number". Convert this
807 * string to a dataset name: "rootpool-name/root-filesystem-name".
810 zfs_parse_bootfs(char *bpath, char *outpath)
816 if (*bpath == 0 || *bpath == '/')
819 (void) strcpy(outpath, bpath);
821 slashp = strchr(bpath, '/');
823 /* if no '/', just return the pool name */
824 if (slashp == NULL) {
828 /* if not a number, just return the root dataset name */
829 if (str_to_uint64(slashp+1, &objnum)) {
834 error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
841 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
844 static int zfsrootdone = 0;
845 zfsvfs_t *zfsvfs = NULL;
854 * The filesystem that we mount as root is defined in the
855 * boot property "zfs-bootfs" with a format of
856 * "poolname/root-dataset-objnum".
858 if (why == ROOT_INIT) {
862 * the process of doing a spa_load will require the
863 * clock to be set before we could (for example) do
864 * something better by looking at the timestamp on
865 * an uberblock, so just set it to -1.
869 if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
870 cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
874 zfs_devid = spa_get_bootprop("diskdevid");
875 error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
877 spa_free_bootprop(zfs_devid);
879 spa_free_bootprop(zfs_bootfs);
880 cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
884 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
885 spa_free_bootprop(zfs_bootfs);
886 cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
891 spa_free_bootprop(zfs_bootfs);
893 if (error = vfs_lock(vfsp))
896 if (error = zfs_domount(vfsp, rootfs.bo_name)) {
897 cmn_err(CE_NOTE, "zfs_domount: error %d", error);
901 zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
903 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
904 cmn_err(CE_NOTE, "zfs_zget: error %d", error);
909 mutex_enter(&vp->v_lock);
911 mutex_exit(&vp->v_lock);
915 * Leave rootvp held. The root file system is never unmounted.
918 vfs_add((struct vnode *)0, vfsp,
919 (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
923 } else if (why == ROOT_REMOUNT) {
924 readonly_changed_cb(vfsp->vfs_data, B_FALSE);
925 vfsp->vfs_flag |= VFS_REMOUNT;
927 /* refresh mount options */
928 zfs_unregister_callbacks(vfsp->vfs_data);
929 return (zfs_register_callbacks(vfsp));
931 } else if (why == ROOT_UNMOUNT) {
932 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
933 (void) zfs_sync(vfsp, 0, 0);
938 * if "why" is equal to anything else other than ROOT_INIT,
939 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
946 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
951 uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ?
952 UIO_SYSSPACE : UIO_USERSPACE;
955 if (mvp->v_type != VDIR)
958 mutex_enter(&mvp->v_lock);
959 if ((uap->flags & MS_REMOUNT) == 0 &&
960 (uap->flags & MS_OVERLAY) == 0 &&
961 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
962 mutex_exit(&mvp->v_lock);
965 mutex_exit(&mvp->v_lock);
968 * ZFS does not support passing unparsed data in via MS_DATA.
969 * Users should use the MS_OPTIONSTR interface; this means
970 * that all option parsing is already done and the options struct
971 * can be interrogated.
973 if ((uap->flags & MS_DATA) && uap->datalen > 0)
977 * Get the objset name (the "special" mount argument).
979 if (error = pn_get(uap->spec, fromspace, &spn))
982 osname = spn.pn_path;
985 * Check for mount privilege?
987 * If we don't have privilege then see if
988 * we have local permission to allow it
990 error = secpolicy_fs_mount(cr, mvp, vfsp);
992 error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
997 * Make sure user is the owner of the mount point
998 * or has sufficient privileges.
1001 vattr.va_mask = AT_UID;
1003 if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
1007 if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
1008 VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
1013 secpolicy_fs_mount_clearopts(cr, vfsp);
1020 * Refuse to mount a filesystem if we are in a local zone and the
1021 * dataset is not visible.
1023 if (!INGLOBALZONE(curproc) &&
1024 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1030 * When doing a remount, we simply refresh our temporary properties
1031 * according to those options set in the current VFS options.
1033 if (uap->flags & MS_REMOUNT) {
1034 /* refresh mount options */
1035 zfs_unregister_callbacks(vfsp->vfs_data);
1036 error = zfs_register_callbacks(vfsp);
1040 error = zfs_domount(vfsp, osname);
1048 zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
1050 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1052 uint64_t refdbytes, availbytes, usedobjs, availobjs;
1056 dmu_objset_space(zfsvfs->z_os,
1057 &refdbytes, &availbytes, &usedobjs, &availobjs);
1060 * The underlying storage pool actually uses multiple block sizes.
1061 * We report the fragsize as the smallest block size we support,
1062 * and we report our blocksize as the filesystem's maximum blocksize.
1064 statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
1065 statp->f_bsize = zfsvfs->z_max_blksz;
1068 * The following report "total" blocks of various kinds in the
1069 * file system, but reported in terms of f_frsize - the
1073 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1074 statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
1075 statp->f_bavail = statp->f_bfree; /* no root reservation */
1078 * statvfs() should really be called statufs(), because it assumes
1079 * static metadata. ZFS doesn't preallocate files, so the best
1080 * we can do is report the max that could possibly fit in f_files,
1081 * and that minus the number actually used in f_ffree.
1082 * For f_ffree, report the smaller of the number of objects available
1083 * and the number of blocks (each object will take at least a block).
1085 statp->f_ffree = MIN(availobjs, statp->f_bfree);
1086 statp->f_favail = statp->f_ffree; /* no "root reservation" */
1087 statp->f_files = statp->f_ffree + usedobjs;
1089 (void) cmpldev(&d32, vfsp->vfs_dev);
1090 statp->f_fsid = d32;
1093 * We're a zfs filesystem.
1095 (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
1097 statp->f_flag = vf_to_stf(vfsp->vfs_flag);
1099 statp->f_namemax = ZFS_MAXNAMELEN;
1102 * We have all of 32 characters to stuff a string here.
1103 * Is there anything useful we could/should provide?
1105 bzero(statp->f_fstr, sizeof (statp->f_fstr));
1112 zfs_root(vfs_t *vfsp, vnode_t **vpp)
1114 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1120 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1122 *vpp = ZTOV(rootzp);
1129 * Teardown the zfsvfs::z_os.
1131 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1132 * and 'z_teardown_inactive_lock' held.
1135 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1139 rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1143 * We purge the parent filesystem's vfsp as the parent
1144 * filesystem and all of its snapshots have their vnode's
1145 * v_vfsp set to the parent's filesystem's vfsp. Note,
1146 * 'z_parent' is self referential for non-snapshots.
1148 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1152 * Close the zil. NB: Can't close the zil while zfs_inactive
1153 * threads are blocked as zil_close can call zfs_inactive.
1155 if (zfsvfs->z_log) {
1156 zil_close(zfsvfs->z_log);
1157 zfsvfs->z_log = NULL;
1160 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
1163 * If we are not unmounting (ie: online recv) and someone already
1164 * unmounted this file system while we were doing the switcheroo,
1165 * or a reopen of z_os failed then just bail out now.
1167 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
1168 rw_exit(&zfsvfs->z_teardown_inactive_lock);
1169 rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1174 * At this point there are no vops active, and any new vops will
1175 * fail with EIO since we have z_teardown_lock for writer (only
1176 * relevant for forced unmount).
1178 * Release all holds on dbufs.
1180 mutex_enter(&zfsvfs->z_znodes_lock);
1181 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
1182 zp = list_next(&zfsvfs->z_all_znodes, zp))
1184 ASSERT(ZTOV(zp)->v_count > 0);
1185 zfs_znode_dmu_fini(zp);
1187 mutex_exit(&zfsvfs->z_znodes_lock);
1190 * If we are unmounting, set the unmounted flag and let new vops
1191 * unblock. zfs_inactive will have the unmounted behavior, and all
1192 * other vops will fail with EIO.
1195 zfsvfs->z_unmounted = B_TRUE;
1196 rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1197 rw_exit(&zfsvfs->z_teardown_inactive_lock);
1201 * z_os will be NULL if there was an error in attempting to reopen
1202 * zfsvfs, so just return as the properties had already been
1203 * unregistered and cached data had been evicted before.
1205 if (zfsvfs->z_os == NULL)
1209 * Unregister properties.
1211 zfs_unregister_callbacks(zfsvfs);
1216 if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
1217 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1218 (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
1226 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
1228 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1232 ret = secpolicy_fs_unmount(cr, vfsp);
1234 ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1235 ZFS_DELEG_PERM_MOUNT, cr);
1241 * We purge the parent filesystem's vfsp as the parent filesystem
1242 * and all of its snapshots have their vnode's v_vfsp set to the
1243 * parent's filesystem's vfsp. Note, 'z_parent' is self
1244 * referential for non-snapshots.
1246 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1249 * Unmount any snapshots mounted under .zfs before unmounting the
1252 if (zfsvfs->z_ctldir != NULL &&
1253 (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
1257 if (!(fflag & MS_FORCE)) {
1259 * Check the number of active vnodes in the file system.
1260 * Our count is maintained in the vfs structure, but the
1261 * number is off by 1 to indicate a hold on the vfs
1264 * The '.zfs' directory maintains a reference of its
1265 * own, and any active references underneath are
1266 * reflected in the vnode count.
1268 if (zfsvfs->z_ctldir == NULL) {
1269 if (vfsp->vfs_count > 1)
1272 if (vfsp->vfs_count > 2 ||
1273 zfsvfs->z_ctldir->v_count > 1)
1278 vfsp->vfs_flag |= VFS_UNMOUNTED;
1280 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1284 * z_os will be NULL if there was an error in
1285 * attempting to reopen zfsvfs.
1289 * Unset the objset user_ptr.
1291 mutex_enter(&os->os->os_user_ptr_lock);
1292 dmu_objset_set_user(os, NULL);
1293 mutex_exit(&os->os->os_user_ptr_lock);
1296 * Finally release the objset
1298 dmu_objset_close(os);
1302 * We can now safely destroy the '.zfs' directory node.
1304 if (zfsvfs->z_ctldir != NULL)
1305 zfsctl_destroy(zfsvfs);
1311 zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
1313 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1315 uint64_t object = 0;
1316 uint64_t fid_gen = 0;
1325 if (fidp->fid_len == LONG_FID_LEN) {
1326 zfid_long_t *zlfid = (zfid_long_t *)fidp;
1327 uint64_t objsetid = 0;
1328 uint64_t setgen = 0;
1330 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1331 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1333 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1334 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1338 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1344 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1345 zfid_short_t *zfid = (zfid_short_t *)fidp;
1347 for (i = 0; i < sizeof (zfid->zf_object); i++)
1348 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1350 for (i = 0; i < sizeof (zfid->zf_gen); i++)
1351 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1357 /* A zero fid_gen means we are in the .zfs control directories */
1359 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
1360 *vpp = zfsvfs->z_ctldir;
1361 ASSERT(*vpp != NULL);
1362 if (object == ZFSCTL_INO_SNAPDIR) {
1363 VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
1364 0, NULL, NULL, NULL, NULL, NULL) == 0);
1372 gen_mask = -1ULL >> (64 - 8 * i);
1374 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
1375 if (err = zfs_zget(zfsvfs, object, &zp)) {
1379 zp_gen = zp->z_phys->zp_gen & gen_mask;
1382 if (zp->z_unlinked || zp_gen != fid_gen) {
1383 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
1395 * Block out VOPs and close zfsvfs_t::z_os
1397 * Note, if successful, then we return with the 'z_teardown_lock' and
1398 * 'z_teardown_inactive_lock' write held.
1401 zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
1405 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1408 *mode = zfsvfs->z_os->os_mode;
1409 dmu_objset_name(zfsvfs->z_os, name);
1410 dmu_objset_close(zfsvfs->z_os);
1416 * Reopen zfsvfs_t::z_os and release VOPs.
1419 zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
1423 ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
1424 ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
1426 err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
1428 zfsvfs->z_os = NULL;
1432 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
1435 * Attempt to re-establish all the active znodes with
1436 * their dbufs. If a zfs_rezget() fails, then we'll let
1437 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
1438 * when they try to use their znode.
1440 mutex_enter(&zfsvfs->z_znodes_lock);
1441 for (zp = list_head(&zfsvfs->z_all_znodes); zp;
1442 zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1443 (void) zfs_rezget(zp);
1445 mutex_exit(&zfsvfs->z_znodes_lock);
1449 /* release the VOPs */
1450 rw_exit(&zfsvfs->z_teardown_inactive_lock);
1451 rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1455 * Since we couldn't reopen zfsvfs::z_os, force
1456 * unmount this file system.
1458 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
1459 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
1465 zfs_freevfs(vfs_t *vfsp)
1467 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1470 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1471 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1473 zfs_fuid_destroy(zfsvfs);
1474 zfs_freezfsvfs(zfsvfs);
1476 atomic_add_32(&zfs_active_fs_count, -1);
1480 * VFS_INIT() initialization. Note that there is no VFS_FINI(),
1481 * so we can't safely do any non-idempotent initialization here.
1482 * Leave that to zfs_init() and zfs_fini(), which are called
1483 * from the module's _init() and _fini() entry points.
1487 zfs_vfsinit(int fstype, char *name)
1494 * Setup vfsops and vnodeops tables.
1496 error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
1498 cmn_err(CE_WARN, "zfs: bad vfs ops template");
1501 error = zfs_create_op_tables();
1503 zfs_remove_op_tables();
1504 cmn_err(CE_WARN, "zfs: bad vnode ops template");
1505 (void) vfs_freevfsops_by_type(zfsfstype);
1509 mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
1512 * Unique major number for all zfs mounts.
1513 * If we run out of 32-bit minors, we'll getudev() another major.
1515 zfs_major = ddi_name_to_major(ZFS_DRIVER);
1516 zfs_minor = ZFS_MIN_MINOR;
1525 * Initialize .zfs directory structures
1530 * Initialize znode cache, vnode ops, etc...
1545 return (zfs_active_fs_count != 0);
1549 zfs_set_version(const char *name, uint64_t newvers)
1557 * XXX for now, require that the filesystem be unmounted. Would
1558 * be nice to find the zfsvfs_t and just update that if
1562 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
1565 error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os);
1569 error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
1573 if (newvers < curvers) {
1578 tx = dmu_tx_create(os);
1579 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR);
1580 error = dmu_tx_assign(tx, TXG_WAIT);
1585 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
1588 spa_history_internal_log(LOG_DS_UPGRADE,
1589 dmu_objset_spa(os), tx, CRED(),
1590 "oldver=%llu newver=%llu dataset = %llu", curvers, newvers,
1595 dmu_objset_close(os);
1600 * Read a property stored within the master node.
1603 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
1609 * Look up the file system's value for the property. For the
1610 * version property, we look up a slightly different string.
1612 if (prop == ZFS_PROP_VERSION)
1613 pname = ZPL_VERSION_STR;
1615 pname = zfs_prop_to_name(prop);
1618 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
1620 if (error == ENOENT) {
1621 /* No value set, use the default value */
1623 case ZFS_PROP_VERSION:
1624 *value = ZPL_VERSION;
1626 case ZFS_PROP_NORMALIZE:
1627 case ZFS_PROP_UTF8ONLY:
1631 *value = ZFS_CASE_SENSITIVE;
1641 static vfsdef_t vfw = {
1645 VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
1650 struct modlfs zfs_modlfs = {
1651 &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw