X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fdmu_objset.c;h=52d55d5668950d9771e096156771cc1a2fe297a7;hb=refs%2Fheads%2Frertzinger%2Ffeature-zpool-get--p;hp=7981e06825c472b2d3b896d0b48dd1be9f6c9b77;hpb=172bb4bd5e4afef721dd4d2972d8680d983f144b;p=zfs.git diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 7981e06..52d55d5 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -19,10 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ +/* Portions Copyright 2010 Robert Milkowski */ + #include #include #include @@ -36,22 +38,41 @@ #include #include #include -#include #include #include #include #include +#include +#include + +/* + * Needed to close a window in dnode_move() that allows the objset to be freed + * before it can be safely accessed. + */ +krwlock_t os_lock; + +void +dmu_objset_init(void) +{ + rw_init(&os_lock, NULL, RW_DEFAULT, NULL); +} + +void +dmu_objset_fini(void) +{ + rw_destroy(&os_lock); +} spa_t * dmu_objset_spa(objset_t *os) { - return (os->os->os_spa); + return (os->os_spa); } zilog_t * dmu_objset_zil(objset_t *os) { - return (os->os->os_zil); + return (os->os_zil); } dsl_pool_t * @@ -59,82 +80,112 @@ dmu_objset_pool(objset_t *os) { dsl_dataset_t *ds; - if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir) + if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) return (ds->ds_dir->dd_pool); else - return (spa_get_dsl(os->os->os_spa)); + return (spa_get_dsl(os->os_spa)); } dsl_dataset_t * dmu_objset_ds(objset_t *os) { - return (os->os->os_dsl_dataset); + return (os->os_dsl_dataset); } dmu_objset_type_t dmu_objset_type(objset_t *os) { - return (os->os->os_phys->os_type); + return (os->os_phys->os_type); } void dmu_objset_name(objset_t *os, char *buf) { - dsl_dataset_name(os->os->os_dsl_dataset, buf); + dsl_dataset_name(os->os_dsl_dataset, buf); } uint64_t dmu_objset_id(objset_t *os) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = os->os_dsl_dataset; return (ds ? ds->ds_object : 0); } +uint64_t +dmu_objset_syncprop(objset_t *os) +{ + return (os->os_sync); +} + +uint64_t +dmu_objset_logbias(objset_t *os) +{ + return (os->os_logbias); +} + static void checksum_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); - osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); + os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); } static void compression_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval != ZIO_COMPRESS_INHERIT); - osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); + os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); } static void copies_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval > 0); - ASSERT(newval <= spa_max_replication(osi->os_spa)); + ASSERT(newval <= spa_max_replication(os->os_spa)); - osi->os_copies = newval; + os->os_copies = newval; +} + +static void +dedup_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + spa_t *spa = os->os_spa; + enum zio_checksum checksum; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval != ZIO_CHECKSUM_INHERIT); + + checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); + + os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; + os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); } static void primary_cache_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. @@ -142,13 +193,13 @@ primary_cache_changed_cb(void *arg, uint64_t newval) ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); - osi->os_primary_cache = newval; + os->os_primary_cache = newval; } static void secondary_cache_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. @@ -156,7 +207,35 @@ secondary_cache_changed_cb(void *arg, uint64_t newval) ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); - osi->os_secondary_cache = newval; + os->os_secondary_cache = newval; +} + +static void +sync_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || + newval == ZFS_SYNC_DISABLED); + + os->os_sync = newval; + if (os->os_zil) + zil_set_sync(os->os_zil, newval); +} + +static void +logbias_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + ASSERT(newval == ZFS_LOGBIAS_LATENCY || + newval == ZFS_LOGBIAS_THROUGHPUT); + os->os_logbias = newval; + if (os->os_zil) + zil_set_logbias(os->os_zil, newval); } void @@ -164,58 +243,76 @@ dmu_objset_byteswap(void *buf, size_t size) { objset_phys_t *osp = buf; - ASSERT(size == sizeof (objset_phys_t)); + ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); dnode_byteswap(&osp->os_meta_dnode); byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); osp->os_type = BSWAP_64(osp->os_type); + osp->os_flags = BSWAP_64(osp->os_flags); + if (size == sizeof (objset_phys_t)) { + dnode_byteswap(&osp->os_userused_dnode); + dnode_byteswap(&osp->os_groupused_dnode); + } } int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - objset_impl_t **osip) + objset_t **osp) { - objset_impl_t *osi; + objset_t *os; int i, err; ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); - osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP); - osi->os.os = osi; - osi->os_dsl_dataset = ds; - osi->os_spa = spa; - osi->os_rootbp = bp; - if (!BP_IS_HOLE(osi->os_rootbp)) { + os = kmem_zalloc(sizeof (objset_t), KM_PUSHPAGE); + os->os_dsl_dataset = ds; + os->os_spa = spa; + os->os_rootbp = bp; + if (!BP_IS_HOLE(os->os_rootbp)) { uint32_t aflags = ARC_WAIT; zbookmark_t zb; - zb.zb_objset = ds ? ds->ds_object : 0; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = 0; - if (DMU_OS_IS_L2CACHEABLE(osi)) + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_L2CACHE; + if (DMU_OS_IS_L2COMPRESSIBLE(os)) + aflags |= ARC_L2COMPRESS; - dprintf_bp(osi->os_rootbp, "reading %s", ""); - /* - * NB: when bprewrite scrub can change the bp, - * and this is called from dmu_objset_open_ds_os, the bp - * could change, and we'll need a lock. - */ - err = arc_read_nolock(NULL, spa, osi->os_rootbp, - arc_getbuf_func, &osi->os_phys_buf, + dprintf_bp(os->os_rootbp, "reading %s", ""); + err = arc_read(NULL, spa, os->os_rootbp, + arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err) { - kmem_free(osi, sizeof (objset_impl_t)); + kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = EIO; return (err); } - osi->os_phys = osi->os_phys_buf->b_data; + + /* Increase the blocksize if we are permitted. */ + if (spa_version(spa) >= SPA_VERSION_USERSPACE && + arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { + arc_buf_t *buf = arc_buf_alloc(spa, + sizeof (objset_phys_t), &os->os_phys_buf, + ARC_BUFC_METADATA); + bzero(buf->b_data, sizeof (objset_phys_t)); + bcopy(os->os_phys_buf->b_data, buf->b_data, + arc_buf_size(os->os_phys_buf)); + (void) arc_buf_remove_ref(os->os_phys_buf, + &os->os_phys_buf); + os->os_phys_buf = buf; + } + + os->os_phys = os->os_phys_buf->b_data; + os->os_flags = os->os_phys->os_flags; } else { - osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t), - &osi->os_phys_buf, ARC_BUFC_METADATA); - osi->os_phys = osi->os_phys_buf->b_data; - bzero(osi->os_phys, sizeof (objset_phys_t)); + int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? + sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; + os->os_phys_buf = arc_buf_alloc(spa, size, + &os->os_phys_buf, ARC_BUFC_METADATA); + os->os_phys = os->os_phys_buf->b_data; + bzero(os->os_phys, size); } /* @@ -226,173 +323,175 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, */ if (ds) { err = dsl_prop_register(ds, "primarycache", - primary_cache_changed_cb, osi); + primary_cache_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "secondarycache", - secondary_cache_changed_cb, osi); + secondary_cache_changed_cb, os); if (!dsl_dataset_is_snapshot(ds)) { if (err == 0) err = dsl_prop_register(ds, "checksum", - checksum_changed_cb, osi); + checksum_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "compression", - compression_changed_cb, osi); + compression_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "copies", - copies_changed_cb, osi); + copies_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "dedup", + dedup_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "logbias", + logbias_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "sync", + sync_changed_cb, os); } if (err) { - VERIFY(arc_buf_remove_ref(osi->os_phys_buf, - &osi->os_phys_buf) == 1); - kmem_free(osi, sizeof (objset_impl_t)); + VERIFY(arc_buf_remove_ref(os->os_phys_buf, + &os->os_phys_buf) == 1); + kmem_free(os, sizeof (objset_t)); return (err); } } else if (ds == NULL) { /* It's the meta-objset. */ - osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4; - osi->os_compress = ZIO_COMPRESS_LZJB; - osi->os_copies = spa_max_replication(spa); - osi->os_primary_cache = ZFS_CACHE_ALL; - osi->os_secondary_cache = ZFS_CACHE_ALL; + os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; + os->os_compress = ZIO_COMPRESS_LZJB; + os->os_copies = spa_max_replication(spa); + os->os_dedup_checksum = ZIO_CHECKSUM_OFF; + os->os_dedup_verify = 0; + os->os_logbias = 0; + os->os_sync = 0; + os->os_primary_cache = ZFS_CACHE_ALL; + os->os_secondary_cache = ZFS_CACHE_ALL; } - osi->os_zil_header = osi->os_phys->os_zil_header; - osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header); + if (ds == NULL || !dsl_dataset_is_snapshot(ds)) + os->os_zil_header = os->os_phys->os_zil_header; + os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { - list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t), + list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); - list_create(&osi->os_free_dnodes[i], sizeof (dnode_t), + list_create(&os->os_free_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); } - list_create(&osi->os_dnodes, sizeof (dnode_t), + list_create(&os->os_dnodes, sizeof (dnode_t), offsetof(dnode_t, dn_link)); - list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), + list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); - mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - - osi->os_meta_dnode = dnode_special_open(osi, - &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); + mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); + + DMU_META_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, + &os->os_meta_dnode); + if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { + DMU_USERUSED_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, + &os->os_userused_dnode); + DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, + &os->os_groupused_dnode); + } /* * We should be the only thread trying to do this because we * have ds_opening_lock */ if (ds) { - VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi, - dmu_objset_evict)); + mutex_enter(&ds->ds_lock); + ASSERT(ds->ds_objset == NULL); + ds->ds_objset = os; + mutex_exit(&ds->ds_lock); } - *osip = osi; + *osp = os; return (0); } -static int -dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type) +int +dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { - objset_impl_t *osi; + int err = 0; mutex_enter(&ds->ds_opening_lock); - osi = dsl_dataset_get_user_ptr(ds); - if (osi == NULL) { - int err; - + *osp = ds->ds_objset; + if (*osp == NULL) { err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), - ds, &ds->ds_phys->ds_bp, &osi); - if (err) { - mutex_exit(&ds->ds_opening_lock); - return (err); - } + ds, dsl_dataset_get_blkptr(ds), osp); } mutex_exit(&ds->ds_opening_lock); - - os->os = osi; - os->os_mode = DS_MODE_NOHOLD; - - if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) - return (EINVAL); - return (0); + return (err); } +/* called from zpl */ int -dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp) +dmu_objset_hold(const char *name, void *tag, objset_t **osp) { - objset_t *os; + dsl_dataset_t *ds; int err; - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - err = dmu_objset_open_ds_os(ds, os, type); + err = dsl_dataset_hold(name, tag, &ds); if (err) - kmem_free(os, sizeof (objset_t)); - else - *osp = os; + return (err); + + err = dmu_objset_from_ds(ds, osp); + if (err) + dsl_dataset_rele(ds, tag); + return (err); } /* called from zpl */ int -dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp) +dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp) { - objset_t *os; dsl_dataset_t *ds; int err; - ASSERT(DS_MODE_TYPE(mode) == DS_MODE_USER || - DS_MODE_TYPE(mode) == DS_MODE_OWNER); - - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - if (DS_MODE_TYPE(mode) == DS_MODE_USER) - err = dsl_dataset_hold(name, os, &ds); - else - err = dsl_dataset_own(name, mode, os, &ds); - if (err) { - kmem_free(os, sizeof (objset_t)); + err = dsl_dataset_own(name, B_FALSE, tag, &ds); + if (err) return (err); - } - err = dmu_objset_open_ds_os(ds, os, type); + err = dmu_objset_from_ds(ds, osp); if (err) { - if (DS_MODE_TYPE(mode) == DS_MODE_USER) - dsl_dataset_rele(ds, os); - else - dsl_dataset_disown(ds, os); - kmem_free(os, sizeof (objset_t)); - } else { - os->os_mode = mode; - *osp = os; + dsl_dataset_disown(ds, tag); + } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { + dmu_objset_disown(*osp, tag); + return (EINVAL); + } else if (!readonly && dsl_dataset_is_snapshot(ds)) { + dmu_objset_disown(*osp, tag); + return (EROFS); } return (err); } void -dmu_objset_close(objset_t *os) +dmu_objset_rele(objset_t *os, void *tag) { - ASSERT(DS_MODE_TYPE(os->os_mode) == DS_MODE_USER || - DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER || - DS_MODE_TYPE(os->os_mode) == DS_MODE_NOHOLD); + dsl_dataset_rele(os->os_dsl_dataset, tag); +} - if (DS_MODE_TYPE(os->os_mode) == DS_MODE_USER) - dsl_dataset_rele(os->os->os_dsl_dataset, os); - else if (DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER) - dsl_dataset_disown(os->os->os_dsl_dataset, os); - kmem_free(os, sizeof (objset_t)); +void +dmu_objset_disown(objset_t *os, void *tag) +{ + dsl_dataset_disown(os->os_dsl_dataset, tag); } int dmu_objset_evict_dbufs(objset_t *os) { - objset_impl_t *osi = os->os; dnode_t *dn; - mutex_enter(&osi->os_lock); + mutex_enter(&os->os_lock); /* process the mdn last, since the other dnodes have holds on it */ - list_remove(&osi->os_dnodes, osi->os_meta_dnode); - list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode); + list_remove(&os->os_dnodes, DMU_META_DNODE(os)); + list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); /* * Find the first dnode with holds. We have to do this dance @@ -400,91 +499,115 @@ dmu_objset_evict_dbufs(objset_t *os) * hold. If there are no holds then it has no dbufs so OK to * skip. */ - for (dn = list_head(&osi->os_dnodes); + for (dn = list_head(&os->os_dnodes); dn && !dnode_add_ref(dn, FTAG); - dn = list_next(&osi->os_dnodes, dn)) + dn = list_next(&os->os_dnodes, dn)) continue; while (dn) { dnode_t *next_dn = dn; do { - next_dn = list_next(&osi->os_dnodes, next_dn); + next_dn = list_next(&os->os_dnodes, next_dn); } while (next_dn && !dnode_add_ref(next_dn, FTAG)); - mutex_exit(&osi->os_lock); + mutex_exit(&os->os_lock); dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); - mutex_enter(&osi->os_lock); + mutex_enter(&os->os_lock); dn = next_dn; } - mutex_exit(&osi->os_lock); - return (list_head(&osi->os_dnodes) != osi->os_meta_dnode); + dn = list_head(&os->os_dnodes); + mutex_exit(&os->os_lock); + return (dn != DMU_META_DNODE(os)); } void -dmu_objset_evict(dsl_dataset_t *ds, void *arg) +dmu_objset_evict(objset_t *os) { - objset_impl_t *osi = arg; - objset_t os; - int i; + dsl_dataset_t *ds = os->os_dsl_dataset; + int t; - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL); - ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL); - } + for (t = 0; t < TXG_SIZE; t++) + ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { if (!dsl_dataset_is_snapshot(ds)) { VERIFY(0 == dsl_prop_unregister(ds, "checksum", - checksum_changed_cb, osi)); + checksum_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "compression", - compression_changed_cb, osi)); + compression_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "copies", - copies_changed_cb, osi)); + copies_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "dedup", + dedup_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "logbias", + logbias_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "sync", + sync_changed_cb, os)); } VERIFY(0 == dsl_prop_unregister(ds, "primarycache", - primary_cache_changed_cb, osi)); + primary_cache_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", - secondary_cache_changed_cb, osi)); + secondary_cache_changed_cb, os)); } + if (os->os_sa) + sa_tear_down(os); + /* * We should need only a single pass over the dnode list, since * nothing can be added to the list at this point. */ - os.os = osi; - (void) dmu_objset_evict_dbufs(&os); + (void) dmu_objset_evict_dbufs(os); + + dnode_special_close(&os->os_meta_dnode); + if (DMU_USERUSED_DNODE(os)) { + dnode_special_close(&os->os_userused_dnode); + dnode_special_close(&os->os_groupused_dnode); + } + zil_free(os->os_zil); - ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); - ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); - ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL); + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - dnode_special_close(osi->os_meta_dnode); - zil_free(osi->os_zil); + VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); + + /* + * This is a barrier to prevent the objset from going away in + * dnode_move() until we can safely ensure that the objset is still in + * use. We consider the objset valid before the barrier and invalid + * after the barrier. + */ + rw_enter(&os_lock, RW_READER); + rw_exit(&os_lock); - VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); - mutex_destroy(&osi->os_lock); - mutex_destroy(&osi->os_obj_lock); - mutex_destroy(&osi->os_user_ptr_lock); - kmem_free(osi, sizeof (objset_impl_t)); + mutex_destroy(&os->os_lock); + mutex_destroy(&os->os_obj_lock); + mutex_destroy(&os->os_user_ptr_lock); + kmem_free(os, sizeof (objset_t)); +} + +timestruc_t +dmu_objset_snap_cmtime(objset_t *os) +{ + return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); } /* called from dsl for meta-objset */ -objset_impl_t * +objset_t * dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx) { - objset_impl_t *osi; + objset_t *os; dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); - if (ds) - mutex_enter(&ds->ds_opening_lock); - VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi)); - if (ds) - mutex_exit(&ds->ds_opening_lock); - mdn = osi->os_meta_dnode; + if (ds != NULL) + VERIFY(0 == dmu_objset_from_ds(ds, &os)); + else + VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os)); + + mdn = DMU_META_DNODE(os); dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); @@ -519,20 +642,25 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ASSERT(type != DMU_OST_NONE); ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); - osi->os_phys->os_type = type; + os->os_phys->os_type = type; + if (dmu_objset_userused_enabled(os)) { + os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + os->os_flags = os->os_phys->os_flags; + } dsl_dataset_dirty(ds, tx); - return (osi); + return (os); } struct oscarg { void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *userarg; - dsl_dataset_t *clone_parent; + dsl_dataset_t *clone_origin; const char *lastname; dmu_objset_type_t type; uint64_t flags; + cred_t *cr; }; /*ARGSUSED*/ @@ -550,17 +678,13 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) if (err != ENOENT) return (err ? err : EEXIST); - if (oa->clone_parent != NULL) { - /* - * You can't clone across pools. - */ - if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool) + if (oa->clone_origin != NULL) { + /* You can't clone across pools. */ + if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool) return (EXDEV); - /* - * You can only clone snapshots, not the head datasets. - */ - if (oa->clone_parent->ds_phys->ds_num_children == 0) + /* You can only clone snapshots, not the head datasets. */ + if (!dsl_dataset_is_snapshot(oa->clone_origin)) return (EINVAL); } @@ -568,41 +692,40 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; + spa_t *spa = dd->dd_pool->dp_spa; struct oscarg *oa = arg2; - dsl_dataset_t *ds; - blkptr_t *bp; - uint64_t dsobj; + uint64_t obj; ASSERT(dmu_tx_is_syncing(tx)); - dsobj = dsl_dataset_create_sync(dd, oa->lastname, - oa->clone_parent, oa->flags, cr, tx); + obj = dsl_dataset_create_sync(dd, oa->lastname, + oa->clone_origin, oa->flags, oa->cr, tx); + + if (oa->clone_origin == NULL) { + dsl_pool_t *dp = dd->dd_pool; + dsl_dataset_t *ds; + blkptr_t *bp; + objset_t *os; - VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds)); - bp = dsl_dataset_get_blkptr(ds); - if (BP_IS_HOLE(bp)) { - objset_impl_t *osi; + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); + bp = dsl_dataset_get_blkptr(ds); + ASSERT(BP_IS_HOLE(bp)); - /* This is an empty dmu_objset; not a clone. */ - osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds), - ds, bp, oa->type, tx); + os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx); if (oa->userfunc) - oa->userfunc(&osi->os, oa->userarg, cr, tx); + oa->userfunc(os, oa->userarg, oa->cr, tx); + dsl_dataset_rele(ds, FTAG); } - spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa, - tx, cr, "dataset = %llu", dsobj); - - dsl_dataset_rele(ds, FTAG); + spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj); } int -dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, uint64_t flags, +dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { dsl_dir_t *pdd; @@ -619,24 +742,13 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, return (EEXIST); } - dprintf("name=%s\n", name); - oa.userfunc = func; oa.userarg = arg; oa.lastname = tail; oa.type = type; oa.flags = flags; + oa.cr = CRED(); - if (clone_parent != NULL) { - /* - * You can't clone to a different type. - */ - if (clone_parent->os->os_phys->os_type != type) { - dsl_dir_close(pdd, FTAG); - return (EINVAL); - } - oa.clone_parent = clone_parent->os->os_dsl_dataset; - } err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, dmu_objset_create_sync, pdd, &oa, 5); dsl_dir_close(pdd, FTAG); @@ -644,208 +756,298 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, } int -dmu_objset_destroy(const char *name) +dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) { - objset_t *os; - int error; - - /* - * If it looks like we'll be able to destroy it, and there's - * an unplayed replay log sitting around, destroy the log. - * It would be nicer to do this in dsl_dataset_destroy_sync(), - * but the replay log objset is modified in open context. - */ - error = dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os); - if (error == 0) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; - zil_destroy(dmu_objset_zil(os), B_FALSE); + dsl_dir_t *pdd; + const char *tail; + int err = 0; + struct oscarg oa = { 0 }; - error = dsl_dataset_destroy(ds, os); - /* - * dsl_dataset_destroy() closes the ds. - */ - kmem_free(os, sizeof (objset_t)); + ASSERT(strchr(name, '@') == NULL); + err = dsl_dir_open(name, FTAG, &pdd, &tail); + if (err) + return (err); + if (tail == NULL) { + dsl_dir_close(pdd, FTAG); + return (EEXIST); } - return (error); + oa.lastname = tail; + oa.clone_origin = clone_origin; + oa.flags = flags; + oa.cr = CRED(); + + err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, + dmu_objset_create_sync, pdd, &oa, 5); + dsl_dir_close(pdd, FTAG); + return (err); } -/* - * This will close the objset. - */ int -dmu_objset_rollback(objset_t *os) +dmu_objset_destroy(const char *name, boolean_t defer) { - int err; dsl_dataset_t *ds; + int error; - ds = os->os->os_dsl_dataset; - - if (!dsl_dataset_tryown(ds, TRUE, os)) { - dmu_objset_close(os); - return (EBUSY); + error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); + if (error == 0) { + error = dsl_dataset_destroy(ds, FTAG, defer); + /* dsl_dataset_destroy() closes the ds. */ } - err = dsl_dataset_rollback(ds, os->os->os_phys->os_type); - - /* - * NB: we close the objset manually because the rollback - * actually implicitly called dmu_objset_evict(), thus freeing - * the objset_impl_t. - */ - dsl_dataset_disown(ds, os); - kmem_free(os, sizeof (objset_t)); - return (err); + return (error); } struct snaparg { dsl_sync_task_group_t *dstg; char *snapname; + char *htag; char failed[MAXPATHLEN]; - boolean_t checkperms; - list_t objsets; + boolean_t recursive; + boolean_t needsuspend; + boolean_t temporary; + nvlist_t *props; + struct dsl_ds_holdarg *ha; /* only needed in the temporary case */ + dsl_dataset_t *newds; }; -struct osnode { - list_node_t node; - objset_t *os; -}; +static int +snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + objset_t *os = arg1; + struct snaparg *sn = arg2; + int error; + + /* The props have already been checked by zfs_check_userprops(). */ + + error = dsl_dataset_snapshot_check(os->os_dsl_dataset, + sn->snapname, tx); + if (error) + return (error); + + if (sn->temporary) { + /* + * Ideally we would just call + * dsl_dataset_user_hold_check() and + * dsl_dataset_destroy_check() here. However the + * dataset we want to hold and destroy is the snapshot + * that we just confirmed we can create, but it won't + * exist until after these checks are run. Do any + * checks we can here and if more checks are added to + * those routines in the future, similar checks may be + * necessary here. + */ + if (spa_version(os->os_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + /* + * Not checking number of tags because the tag will be + * unique, as it will be the only tag. + */ + if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) + return (E2BIG); + + sn->ha = kmem_alloc(sizeof(struct dsl_ds_holdarg), KM_PUSHPAGE); + sn->ha->temphold = B_TRUE; + sn->ha->htag = sn->htag; + } + return (error); +} + +static void +snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + objset_t *os = arg1; + dsl_dataset_t *ds = os->os_dsl_dataset; + struct snaparg *sn = arg2; + + dsl_dataset_snapshot_sync(ds, sn->snapname, tx); + + if (sn->props) { + dsl_props_arg_t pa; + pa.pa_props = sn->props; + pa.pa_source = ZPROP_SRC_LOCAL; + dsl_props_set_sync(ds->ds_prev, &pa, tx); + } + + if (sn->temporary) { + struct dsl_ds_destroyarg da; + + dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx); + kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg)); + sn->ha = NULL; + sn->newds = ds->ds_prev; + + da.ds = ds->ds_prev; + da.defer = B_TRUE; + dsl_dataset_destroy_sync(&da, FTAG, tx); + } +} static int -dmu_objset_snapshot_one(char *name, void *arg) +dmu_objset_snapshot_one(const char *name, void *arg) { struct snaparg *sn = arg; objset_t *os; int err; + char *cp; + + /* + * If the objset starts with a '%', then ignore it unless it was + * explicitly named (ie, not recursive). These hidden datasets + * are always inconsistent, and by not opening them here, we can + * avoid a race with dsl_dir_destroy_check(). + */ + cp = strrchr(name, '/'); + if (cp && cp[1] == '%' && sn->recursive) + return (0); (void) strcpy(sn->failed, name); /* - * Check permissions only when requested. This only applies when - * doing a recursive snapshot. The permission checks for the starting - * dataset have already been performed in zfs_secpolicy_snapshot() + * Check permissions if we are doing a recursive snapshot. The + * permission checks for the starting dataset have already been + * performed in zfs_secpolicy_snapshot() */ - if (sn->checkperms == B_TRUE && - (err = zfs_secpolicy_snapshot_perms(name, CRED()))) + if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED()))) return (err); - err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_USER, &os); + err = dmu_objset_hold(name, sn, &os); if (err != 0) return (err); - /* If the objset is in an inconsistent state, return busy */ - if (os->os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { - dmu_objset_close(os); - return (EBUSY); - } - /* - * NB: we need to wait for all in-flight changes to get to disk, - * so that we snapshot those changes. zil_suspend does this as - * a side effect. + * If the objset is in an inconsistent state (eg, in the process + * of being destroyed), don't snapshot it. As with %hidden + * datasets, we return EBUSY if this name was explicitly + * requested (ie, not recursive), and otherwise ignore it. */ - err = zil_suspend(dmu_objset_zil(os)); - if (err == 0) { - struct osnode *osn; - dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check, - dsl_dataset_snapshot_sync, os->os->os_dsl_dataset, - sn->snapname, 3); - osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP); - osn->os = os; - list_insert_tail(&sn->objsets, osn); - } else { - dmu_objset_close(os); + if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { + dmu_objset_rele(os, sn); + return (sn->recursive ? 0 : EBUSY); } - return (err); + if (sn->needsuspend) { + err = zil_suspend(dmu_objset_zil(os)); + if (err) { + dmu_objset_rele(os, sn); + return (err); + } + } + dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync, + os, sn, 3); + + return (0); } int -dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) +dmu_objset_snapshot(char *fsname, char *snapname, char *tag, + nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd) { dsl_sync_task_t *dst; - struct osnode *osn; - struct snaparg sn = { 0 }; + struct snaparg *sn; spa_t *spa; + minor_t minor; int err; - (void) strcpy(sn.failed, fsname); + sn = kmem_alloc(sizeof (struct snaparg), KM_SLEEP); + (void) strcpy(sn->failed, fsname); err = spa_open(fsname, &spa, FTAG); - if (err) + if (err) { + kmem_free(sn, sizeof (struct snaparg)); return (err); + } + + if (temporary) { + if (cleanup_fd < 0) { + spa_close(spa, FTAG); + return (EINVAL); + } + if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) { + spa_close(spa, FTAG); + return (err); + } + } - sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - sn.snapname = snapname; - list_create(&sn.objsets, sizeof (struct osnode), - offsetof(struct osnode, node)); + sn->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + sn->snapname = snapname; + sn->htag = tag; + sn->props = props; + sn->recursive = recursive; + sn->needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + sn->temporary = temporary; + sn->ha = NULL; + sn->newds = NULL; if (recursive) { - sn.checkperms = B_TRUE; err = dmu_objset_find(fsname, - dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN); + dmu_objset_snapshot_one, sn, DS_FIND_CHILDREN); } else { - sn.checkperms = B_FALSE; - err = dmu_objset_snapshot_one(fsname, &sn); + err = dmu_objset_snapshot_one(fsname, sn); } - if (err) - goto out; - - err = dsl_sync_task_group_wait(sn.dstg); - - for (dst = list_head(&sn.dstg->dstg_tasks); dst; - dst = list_next(&sn.dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; - if (dst->dst_err) - dsl_dataset_name(ds, sn.failed); - } - -out: - while (osn = list_head(&sn.objsets)) { - list_remove(&sn.objsets, osn); - zil_resume(dmu_objset_zil(osn->os)); - dmu_objset_close(osn->os); - kmem_free(osn, sizeof (struct osnode)); + if (err == 0) + err = dsl_sync_task_group_wait(sn->dstg); + + for (dst = list_head(&sn->dstg->dstg_tasks); dst; + dst = list_next(&sn->dstg->dstg_tasks, dst)) { + objset_t *os = dst->dst_arg1; + dsl_dataset_t *ds = os->os_dsl_dataset; + if (dst->dst_err) { + dsl_dataset_name(ds, sn->failed); + } else if (temporary) { + dsl_register_onexit_hold_cleanup(sn->newds, tag, minor); + } + if (sn->needsuspend) + zil_resume(dmu_objset_zil(os)); + dmu_objset_rele(os, sn); } - list_destroy(&sn.objsets); if (err) - (void) strcpy(fsname, sn.failed); - dsl_sync_task_group_destroy(sn.dstg); + (void) strcpy(fsname, sn->failed); + if (temporary) + zfs_onexit_fd_rele(cleanup_fd); + dsl_sync_task_group_destroy(sn->dstg); spa_close(spa, FTAG); + kmem_free(sn, sizeof (struct snaparg)); return (err); } static void -dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) +dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) { dnode_t *dn; - while (dn = list_head(list)) { + while ((dn = list_head(list))) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ASSERT(dn->dn_dbuf->db_data_pending); /* - * Initialize dn_zio outside dnode_sync() - * to accomodate meta-dnode + * Initialize dn_zio outside dnode_sync() because the + * meta-dnode needs to set it ouside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); list_remove(list, dn); + + if (newlist) { + (void) dnode_add_ref(dn, newlist); + list_insert_tail(newlist, dn); + } + dnode_sync(dn, tx); } } /* ARGSUSED */ static void -ready(zio_t *zio, arc_buf_t *abuf, void *arg) +dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) { + int i; + blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - objset_impl_t *os = arg; + objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; ASSERT(bp == os->os_rootbp); @@ -853,31 +1055,45 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg) ASSERT(BP_GET_LEVEL(bp) == 0); /* - * Update rootbp fill count. + * Update rootbp fill count: it should be the number of objects + * allocated in the object set (not counting the "special" + * objects that are stored in the objset_phys_t -- the meta + * dnode and user/group accounting objects). */ - bp->blk_fill = 1; /* count the meta-dnode */ - for (int i = 0; i < dnp->dn_nblkptr; i++) + bp->blk_fill = 0; + for (i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += dnp->dn_blkptr[i].blk_fill; +} + +/* ARGSUSED */ +static void +dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + objset_t *os = arg; if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { - ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + ASSERT(BP_EQUAL(bp, bp_orig)); } else { - if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) - (void) dsl_dataset_block_kill(os->os_dsl_dataset, - &zio->io_bp_orig, zio, os->os_synctx); - dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx); + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); } } /* called from dsl */ void -dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) +dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; zbookmark_t zb; - writeprops_t wp = { 0 }; + zio_prop_t zp; zio_t *zio; list_t *list; + list_t *newlist = NULL; dbuf_dirty_record_t *dr; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); @@ -898,40 +1114,52 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) /* * Create the root block IO */ - zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; - zb.zb_object = 0; - zb.zb_level = -1; /* for block ordering; it's level 0 on disk */ - zb.zb_blkid = 0; - - wp.wp_type = DMU_OT_OBJSET; - wp.wp_level = 0; /* on-disk BP level; see above */ - wp.wp_copies = os->os_copies; - wp.wp_oschecksum = os->os_checksum; - wp.wp_oscompress = os->os_compress; - - if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) { - (void) dsl_dataset_block_kill(os->os_dsl_dataset, - os->os_rootbp, pio, tx); - } - + SET_BOOKMARK(&zb, os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); arc_release(os->os_phys_buf, &os->os_phys_buf); - zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os), - tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + + dmu_write_policy(os, NULL, 0, 0, &zp); + + zio = arc_write(pio, os->os_spa, tx->tx_txg, + os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), + DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, + dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED, &zb); /* - * Sync meta-dnode - the parent IO for the sync is the root block + * Sync special dnodes - the parent IO for the sync is the root block */ - os->os_meta_dnode->dn_zio = zio; - dnode_sync(os->os_meta_dnode, tx); + DMU_META_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_META_DNODE(os), tx); + + os->os_phys->os_flags = os->os_flags; + + if (DMU_USERUSED_DNODE(os) && + DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { + DMU_USERUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_USERUSED_DNODE(os), tx); + DMU_GROUPUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_GROUPUSED_DNODE(os), tx); + } txgoff = tx->tx_txg & TXG_MASK; - dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx); - dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx); + if (dmu_objset_userused_enabled(os)) { + newlist = &os->os_synced_dnodes; + /* + * We must create the list here because it uses the + * dn_dirty_link[] of this txg. + */ + list_create(newlist, sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[txgoff])); + } - list = &os->os_meta_dnode->dn_dirty_records[txgoff]; - while (dr = list_head(list)) { + dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); + dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); + + list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; + while ((dr = list_head(list)) != NULL) { ASSERT(dr->dr_dbuf->db_level == 0); list_remove(list, dr); if (dr->dr_zio) @@ -945,46 +1173,348 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) zio_nowait(zio); } +boolean_t +dmu_objset_is_dirty(objset_t *os, uint64_t txg) +{ + return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || + !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); +} + +static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; + +void +dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) +{ + used_cbs[ost] = cb; +} + +boolean_t +dmu_objset_userused_enabled(objset_t *os) +{ + return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && + used_cbs[os->os_phys->os_type] != NULL && + DMU_USERUSED_DNODE(os) != NULL); +} + +static void +do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, + uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) +{ + if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { + int64_t delta = DNODE_SIZE + used; + if (subtract) + delta = -delta; + VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, + user, delta, tx)); + VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, + group, delta, tx)); + } +} + +void +dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) +{ + dnode_t *dn; + list_t *list = &os->os_synced_dnodes; + + ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); + + while ((dn = list_head(list)) != NULL) { + int flags; + ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); + ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || + dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED); + + /* Allocate the user/groupused objects if necessary. */ + if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { + VERIFY(0 == zap_create_claim(os, + DMU_USERUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + VERIFY(0 == zap_create_claim(os, + DMU_GROUPUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + } + + /* + * We intentionally modify the zap object even if the + * net delta is zero. Otherwise + * the block of the zap obj could be shared between + * datasets but need to be different between them after + * a bprewrite. + */ + + flags = dn->dn_id_flags; + ASSERT(flags); + if (flags & DN_ID_OLD_EXIST) { + do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, + dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); + } + if (flags & DN_ID_NEW_EXIST) { + do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), + dn->dn_phys->dn_flags, dn->dn_newuid, + dn->dn_newgid, B_FALSE, tx); + } + + mutex_enter(&dn->dn_mtx); + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + if (dn->dn_id_flags & DN_ID_NEW_EXIST) { + dn->dn_olduid = dn->dn_newuid; + dn->dn_oldgid = dn->dn_newgid; + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (dn->dn_bonuslen == 0) + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + else + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); + mutex_exit(&dn->dn_mtx); + + list_remove(list, dn); + dnode_rele(dn, list); + } +} + +/* + * Returns a pointer to data to find uid/gid from + * + * If a dirty record for transaction group that is syncing can't + * be found then NULL is returned. In the NULL case it is assumed + * the uid/gid aren't changing. + */ +static void * +dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr, **drp; + void *data; + + if (db->db_dirtycnt == 0) + return (db->db.db_data); /* Nothing is changing */ + + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) + if (dr->dr_txg == tx->tx_txg) + break; + + if (dr == NULL) { + data = NULL; + } else { + dnode_t *dn; + + DB_DNODE_ENTER(dr->dr_dbuf); + dn = DB_DNODE(dr->dr_dbuf); + + if (dn->dn_bonuslen == 0 && + dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) + data = dr->dt.dl.dr_data->b_data; + else + data = dr->dt.dl.dr_data; + + DB_DNODE_EXIT(dr->dr_dbuf); + } + + return (data); +} + +void +dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) +{ + objset_t *os = dn->dn_objset; + void *data = NULL; + dmu_buf_impl_t *db = NULL; + uint64_t *user = NULL, *group = NULL; + int flags = dn->dn_id_flags; + int error; + boolean_t have_spill = B_FALSE; + + if (!dmu_objset_userused_enabled(dn->dn_objset)) + return; + + if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| + DN_ID_CHKED_SPILL))) + return; + + if (before && dn->dn_bonuslen != 0) + data = DN_BONUS(dn->dn_phys); + else if (!before && dn->dn_bonuslen != 0) { + if (dn->dn_bonus) { + db = dn->dn_bonus; + mutex_enter(&db->db_mtx); + data = dmu_objset_userquota_find_data(db, tx); + } else { + data = DN_BONUS(dn->dn_phys); + } + } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { + int rf = 0; + + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + error = dmu_spill_hold_by_dnode(dn, + rf | DB_RF_MUST_SUCCEED, + FTAG, (dmu_buf_t **)&db); + ASSERT(error == 0); + mutex_enter(&db->db_mtx); + data = (before) ? db->db.db_data : + dmu_objset_userquota_find_data(db, tx); + have_spill = B_TRUE; + } else { + mutex_enter(&dn->dn_mtx); + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + mutex_exit(&dn->dn_mtx); + return; + } + + if (before) { + ASSERT(data); + user = &dn->dn_olduid; + group = &dn->dn_oldgid; + } else if (data) { + user = &dn->dn_newuid; + group = &dn->dn_newgid; + } + + /* + * Must always call the callback in case the object + * type has changed and that type isn't an object type to track + */ + error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, + user, group); + + /* + * Preserve existing uid/gid when the callback can't determine + * what the new uid/gid are and the callback returned EEXIST. + * The EEXIST error tells us to just use the existing uid/gid. + * If we don't know what the old values are then just assign + * them to 0, since that is a new file being created. + */ + if (!before && data == NULL && error == EEXIST) { + if (flags & DN_ID_OLD_EXIST) { + dn->dn_newuid = dn->dn_olduid; + dn->dn_newgid = dn->dn_oldgid; + } else { + dn->dn_newuid = 0; + dn->dn_newgid = 0; + } + error = 0; + } + + if (db) + mutex_exit(&db->db_mtx); + + mutex_enter(&dn->dn_mtx); + if (error == 0 && before) + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (error == 0 && !before) + dn->dn_id_flags |= DN_ID_NEW_EXIST; + + if (have_spill) { + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + } else { + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + mutex_exit(&dn->dn_mtx); + if (have_spill) + dmu_buf_rele((dmu_buf_t *)db, FTAG); +} + +boolean_t +dmu_objset_userspace_present(objset_t *os) +{ + return (os->os_phys->os_flags & + OBJSET_FLAG_USERACCOUNTING_COMPLETE); +} + +int +dmu_objset_userspace_upgrade(objset_t *os) +{ + uint64_t obj; + int err = 0; + + if (dmu_objset_userspace_present(os)) + return (0); + if (!dmu_objset_userused_enabled(os)) + return (ENOTSUP); + if (dmu_objset_is_snapshot(os)) + return (EINVAL); + + /* + * We simply need to mark every object dirty, so that it will be + * synced out and now accounted. If this is called + * concurrently, or if we already did some work before crashing, + * that's fine, since we track each object's accounted state + * independently. + */ + + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { + dmu_tx_t *tx; + dmu_buf_t *db; + int objerr; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (EINTR); + + objerr = dmu_bonus_hold(os, obj, FTAG, &db); + if (objerr) + continue; + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, obj); + objerr = dmu_tx_assign(tx, TXG_WAIT); + if (objerr) { + dmu_tx_abort(tx); + continue; + } + dmu_buf_will_dirty(db, tx); + dmu_buf_rele(db, FTAG); + dmu_tx_commit(tx); + } + + os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + txg_wait_synced(dmu_objset_pool(os), 0); + return (0); +} + void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { - dsl_dataset_space(os->os->os_dsl_dataset, refdbytesp, availbytesp, + dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, usedobjsp, availobjsp); } uint64_t dmu_objset_fsid_guid(objset_t *os) { - return (dsl_dataset_fsid_guid(os->os->os_dsl_dataset)); + return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); } void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) { - stat->dds_type = os->os->os_phys->os_type; - if (os->os->os_dsl_dataset) - dsl_dataset_fast_stat(os->os->os_dsl_dataset, stat); + stat->dds_type = os->os_phys->os_type; + if (os->os_dsl_dataset) + dsl_dataset_fast_stat(os->os_dsl_dataset, stat); } void dmu_objset_stats(objset_t *os, nvlist_t *nv) { - ASSERT(os->os->os_dsl_dataset || - os->os->os_phys->os_type == DMU_OST_META); + ASSERT(os->os_dsl_dataset || + os->os_phys->os_type == DMU_OST_META); - if (os->os->os_dsl_dataset != NULL) - dsl_dataset_stats(os->os->os_dsl_dataset, nv); + if (os->os_dsl_dataset != NULL) + dsl_dataset_stats(os->os_dsl_dataset, nv); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, - os->os->os_phys->os_type); + os->os_phys->os_type); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, + dmu_objset_userspace_present(os)); } int dmu_objset_is_snapshot(objset_t *os) { - if (os->os->os_dsl_dataset != NULL) - return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset)); + if (os->os_dsl_dataset != NULL) + return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); else return (B_FALSE); } @@ -993,7 +1523,7 @@ int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = os->os_dsl_dataset; uint64_t ignored; if (ds->ds_phys->ds_snapnames_zapobj == 0) @@ -1008,7 +1538,7 @@ int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = os->os_dsl_dataset; zap_cursor_t cursor; zap_attribute_t attr; @@ -1042,15 +1572,21 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, } int +dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value) +{ + return dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value); +} + +int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp) { - dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir; + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; zap_cursor_t cursor; zap_attribute_t attr; /* there is no next dir on a snapshot! */ - if (os->os->os_dsl_dataset->ds_object != + if (os->os_dsl_dataset->ds_object != dd->dd_phys->dd_head_dataset_obj) return (ENOENT); @@ -1079,7 +1615,7 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, } struct findarg { - int (*func)(char *, void *); + int (*func)(const char *, void *); void *arg; }; @@ -1088,7 +1624,7 @@ static int findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) { struct findarg *fa = arg; - return (fa->func((char *)dsname, fa->arg)); + return (fa->func(dsname, fa->arg)); } /* @@ -1096,7 +1632,8 @@ findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) * Perhaps change all callers to use dmu_objset_find_spa()? */ int -dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) +dmu_objset_find(char *name, int func(const char *, void *), void *arg, + int flags) { struct findarg fa; fa.func = func; @@ -1133,7 +1670,7 @@ dmu_objset_find_spa(spa_t *spa, const char *name, } thisobj = dd->dd_phys->dd_head_dataset_obj; - attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + attr = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); dp = dd->dd_pool; /* @@ -1147,12 +1684,9 @@ dmu_objset_find_spa(spa_t *spa, const char *name, ASSERT(attr->za_integer_length == sizeof (uint64_t)); ASSERT(attr->za_num_integers == 1); - child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strcpy(child, name); - (void) strcat(child, "/"); - (void) strcat(child, attr->za_name); + child = kmem_asprintf("%s/%s", name, attr->za_name); err = dmu_objset_find_spa(spa, child, func, arg, flags); - kmem_free(child, MAXPATHLEN); + strfree(child); if (err) break; } @@ -1186,13 +1720,11 @@ dmu_objset_find_spa(spa_t *spa, const char *name, sizeof (uint64_t)); ASSERT(attr->za_num_integers == 1); - child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strcpy(child, name); - (void) strcat(child, "@"); - (void) strcat(child, attr->za_name); + child = kmem_asprintf("%s@%s", + name, attr->za_name); err = func(spa, attr->za_first_integer, child, arg); - kmem_free(child, MAXPATHLEN); + strfree(child); if (err) break; } @@ -1213,16 +1745,87 @@ dmu_objset_find_spa(spa_t *spa, const char *name, return (err); } +/* ARGSUSED */ +int +dmu_objset_prefetch(const char *name, void *arg) +{ + dsl_dataset_t *ds; + + if (dsl_dataset_hold(name, FTAG, &ds)) + return (0); + + if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { + mutex_enter(&ds->ds_opening_lock); + if (ds->ds_objset == NULL) { + uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, + ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + (void) arc_read(NULL, dsl_dataset_get_spa(ds), + &ds->ds_phys->ds_bp, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &zb); + } + mutex_exit(&ds->ds_opening_lock); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + void dmu_objset_set_user(objset_t *os, void *user_ptr) { - ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); - os->os->os_user_ptr = user_ptr; + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + os->os_user_ptr = user_ptr; } void * dmu_objset_get_user(objset_t *os) { - ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); - return (os->os->os_user_ptr); + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + return (os->os_user_ptr); } + +#if defined(_KERNEL) && defined(HAVE_SPL) +EXPORT_SYMBOL(dmu_objset_zil); +EXPORT_SYMBOL(dmu_objset_pool); +EXPORT_SYMBOL(dmu_objset_ds); +EXPORT_SYMBOL(dmu_objset_type); +EXPORT_SYMBOL(dmu_objset_name); +EXPORT_SYMBOL(dmu_objset_hold); +EXPORT_SYMBOL(dmu_objset_own); +EXPORT_SYMBOL(dmu_objset_rele); +EXPORT_SYMBOL(dmu_objset_disown); +EXPORT_SYMBOL(dmu_objset_from_ds); +EXPORT_SYMBOL(dmu_objset_create); +EXPORT_SYMBOL(dmu_objset_clone); +EXPORT_SYMBOL(dmu_objset_destroy); +EXPORT_SYMBOL(dmu_objset_snapshot); +EXPORT_SYMBOL(dmu_objset_stats); +EXPORT_SYMBOL(dmu_objset_fast_stat); +EXPORT_SYMBOL(dmu_objset_spa); +EXPORT_SYMBOL(dmu_objset_space); +EXPORT_SYMBOL(dmu_objset_fsid_guid); +EXPORT_SYMBOL(dmu_objset_find); +EXPORT_SYMBOL(dmu_objset_find_spa); +EXPORT_SYMBOL(dmu_objset_prefetch); +EXPORT_SYMBOL(dmu_objset_byteswap); +EXPORT_SYMBOL(dmu_objset_evict_dbufs); +EXPORT_SYMBOL(dmu_objset_snap_cmtime); + +EXPORT_SYMBOL(dmu_objset_sync); +EXPORT_SYMBOL(dmu_objset_is_dirty); +EXPORT_SYMBOL(dmu_objset_create_impl); +EXPORT_SYMBOL(dmu_objset_open_impl); +EXPORT_SYMBOL(dmu_objset_evict); +EXPORT_SYMBOL(dmu_objset_register_type); +EXPORT_SYMBOL(dmu_objset_do_userquota_updates); +EXPORT_SYMBOL(dmu_objset_userquota_get_ids); +EXPORT_SYMBOL(dmu_objset_userused_enabled); +EXPORT_SYMBOL(dmu_objset_userspace_upgrade); +EXPORT_SYMBOL(dmu_objset_userspace_present); +#endif