X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fdsl_dataset.c;h=fce6d3c1a969cfcfa3610e6131106beb6ffa69c4;hb=8630650a8d9cfba379a5b73bd95e903d577e0d8d;hp=ddd83576c65e8fbc69cb6b6eba180bc89cc0f66f;hpb=428870ff734fdaccc342b33fc53cf94724409a46;p=zfs.git diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index ddd8357..fce6d3c 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include @@ -37,15 +38,11 @@ #include #include #include +#include #include #include #include -/* - * Enable/disable prefetching of dedup-ed blocks which are going to be freed. - */ -int zfs_dedup_prefetch = 1; - static char *dsl_reaper = "the grim reaper"; static dsl_checkfunc_t dsl_dataset_destroy_begin_check; @@ -89,11 +86,13 @@ parent_delta(dsl_dataset_t *ds, int64_t delta) void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { - int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); - int compressed = BP_GET_PSIZE(bp); - int uncompressed = BP_GET_UCSIZE(bp); + int used, compressed, uncompressed; int64_t delta; + used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); + compressed = BP_GET_PSIZE(bp); + uncompressed = BP_GET_UCSIZE(bp); + dprintf_bp(bp, "ds=%p", ds); ASSERT(dmu_tx_is_syncing(tx)); @@ -134,15 +133,17 @@ int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async) { + int used, compressed, uncompressed; + if (BP_IS_HOLE(bp)) return (0); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(bp->blk_birth <= tx->tx_txg); - int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); - int compressed = BP_GET_PSIZE(bp); - int uncompressed = BP_GET_UCSIZE(bp); + used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); + compressed = BP_GET_PSIZE(bp); + uncompressed = BP_GET_UCSIZE(bp); ASSERT(used > 0); if (ds == NULL) { @@ -253,8 +254,7 @@ dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, if (blk_birth <= dsl_dataset_prev_snap_txg(ds)) return (B_FALSE); - if (zfs_dedup_prefetch && bp && BP_GET_DEDUP(bp)) - ddt_prefetch(dsl_dataset_get_spa(ds), bp); + ddt_prefetch(dsl_dataset_get_spa(ds), bp); return (B_TRUE); } @@ -372,6 +372,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dmu_buf_t *dbuf; dsl_dataset_t *ds; int err; + dmu_object_info_t doi; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || dsl_pool_sync_context(dp)); @@ -379,19 +380,26 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); if (err) return (err); + + /* Make sure dsobj has the correct object type. */ + dmu_object_info_from_db(dbuf, &doi); + if (doi.doi_type != DMU_OT_DSL_DATASET) + return (EINVAL); + ds = dmu_buf_get_user(dbuf); if (ds == NULL) { - dsl_dataset_t *winner; + dsl_dataset_t *winner = NULL; ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; ds->ds_object = dsobj; ds->ds_phys = dbuf->db_data; + list_link_init(&ds->ds_synced_link); mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&ds->ds_rwlock, 0, 0, 0); + rw_init(&ds->ds_rwlock, NULL, RW_DEFAULT, NULL); cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); bplist_create(&ds->ds_pending_deadlist); @@ -881,72 +889,74 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_dir_close(dd, FTAG); - return (dsobj); -} - -struct destroyarg { - dsl_sync_task_group_t *dstg; - char *snapname; - char *failed; - boolean_t defer; -}; - -static int -dsl_snapshot_destroy_one(const char *name, void *arg) -{ - struct destroyarg *da = arg; - dsl_dataset_t *ds; - int err; - char *dsname; + /* + * If we are creating a clone, make sure we zero out any stale + * data from the origin snapshots zil header. + */ + if (origin != NULL) { + dsl_dataset_t *ds; + objset_t *os; - dsname = kmem_asprintf("%s@%s", name, da->snapname); - err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds); - strfree(dsname); - if (err == 0) { - struct dsl_ds_destroyarg *dsda; - - dsl_dataset_make_exclusive(ds, da->dstg); - dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); - dsda->ds = ds; - dsda->defer = da->defer; - dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, dsda, da->dstg, 0); - } else if (err == ENOENT) { - err = 0; - } else { - (void) strcpy(da->failed, name); + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); + bzero(&os->os_zil_header, sizeof (os->os_zil_header)); + dsl_dataset_dirty(ds, tx); + dsl_dataset_rele(ds, FTAG); } - return (err); + + return (dsobj); } /* - * Destroy 'snapname' in all descendants of 'fsname'. + * The snapshots must all be in the same pool. */ -#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy int -dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) +dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed) { int err; - struct destroyarg da; dsl_sync_task_t *dst; spa_t *spa; + nvpair_t *pair; + dsl_sync_task_group_t *dstg; - err = spa_open(fsname, &spa, FTAG); + pair = nvlist_next_nvpair(snaps, NULL); + if (pair == NULL) + return (0); + + err = spa_open(nvpair_name(pair), &spa, FTAG); if (err) return (err); - da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - da.snapname = snapname; - da.failed = fsname; - da.defer = defer; + dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + dsl_dataset_t *ds; + int err; - err = dmu_objset_find(fsname, - dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); + err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); + if (err == 0) { + struct dsl_ds_destroyarg *dsda; + + dsl_dataset_make_exclusive(ds, dstg); + dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), + KM_SLEEP); + dsda->ds = ds; + dsda->defer = defer; + dsl_sync_task_create(dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, dsda, dstg, 0); + } else if (err == ENOENT) { + err = 0; + } else { + (void) strcpy(failed, nvpair_name(pair)); + break; + } + } if (err == 0) - err = dsl_sync_task_group_wait(da.dstg); + err = dsl_sync_task_group_wait(dstg); - for (dst = list_head(&da.dstg->dstg_tasks); dst; - dst = list_next(&da.dstg->dstg_tasks, dst)) { + for (dst = list_head(&dstg->dstg_tasks); dst; + dst = list_next(&dstg->dstg_tasks, dst)) { struct dsl_ds_destroyarg *dsda = dst->dst_arg1; dsl_dataset_t *ds = dsda->ds; @@ -954,17 +964,17 @@ dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) * Return the file system name that triggered the error */ if (dst->dst_err) { - dsl_dataset_name(ds, fsname); - *strchr(fsname, '@') = '\0'; + dsl_dataset_name(ds, failed); } ASSERT3P(dsda->rm_origin, ==, NULL); - dsl_dataset_disown(ds, da.dstg); + dsl_dataset_disown(ds, dstg); kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); } - dsl_sync_task_group_destroy(da.dstg); + dsl_sync_task_group_destroy(dstg); spa_close(spa, FTAG); return (err); + } static boolean_t @@ -1033,7 +1043,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) dsl_dir_t *dd; uint64_t obj; struct dsl_ds_destroyarg dsda = { 0 }; - dsl_dataset_t dummy_ds = { 0 }; + dsl_dataset_t *dummy_ds; dsda.ds = ds; @@ -1053,8 +1063,9 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) } dd = ds->ds_dir; - dummy_ds.ds_dir = dd; - dummy_ds.ds_object = ds->ds_object; + dummy_ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); + dummy_ds->ds_dir = dd; + dummy_ds->ds_object = ds->ds_object; /* * Check for errors and mark this ds as inconsistent, in @@ -1063,11 +1074,11 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, dsl_dataset_destroy_begin_sync, ds, NULL, 0); if (err) - goto out; + goto out_free; err = dmu_objset_from_ds(ds, &os); if (err) - goto out; + goto out_free; /* * remove the objects in open context, so that we won't @@ -1081,11 +1092,16 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) */ (void) dmu_free_object(os, obj); } + if (err != ESRCH) + goto out_free; + + /* + * Only the ZIL knows how to free log blocks. + */ + zil_destroy(dmu_objset_zil(os), B_FALSE); /* - * We need to sync out all in-flight IO before we try to evict - * (the dataset evict func is trying to clear the cached entries - * for this dataset in the ARC). + * Sync out all in-flight IO. */ txg_wait_synced(dd->dd_pool, 0); @@ -1095,23 +1111,19 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) */ if (ds->ds_phys->ds_bp.blk_fill == 0 && dmu_objset_userused_enabled(os)) { - uint64_t count; - + ASSERTV(uint64_t count); ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || count == 0); ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || count == 0); } - if (err != ESRCH) - goto out; - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); rw_exit(&dd->dd_pool->dp_config_rwlock); if (err) - goto out; + goto out_free; /* * Blow away the dsl_dir + head dataset. @@ -1127,7 +1139,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) err = dsl_dataset_origin_rm_prep(&dsda, tag); if (err) { dsl_dir_close(dd, FTAG); - goto out; + goto out_free; } } @@ -1135,7 +1147,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) dsl_sync_task_create(dstg, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, &dsda, tag, 0); dsl_sync_task_create(dstg, dsl_dir_destroy_check, - dsl_dir_destroy_sync, &dummy_ds, FTAG, 0); + dsl_dir_destroy_sync, dummy_ds, FTAG, 0); err = dsl_sync_task_group_wait(dstg); dsl_sync_task_group_destroy(dstg); @@ -1158,6 +1170,9 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) /* if it is successful, dsl_dir_destroy_sync will close the dd */ if (err) dsl_dir_close(dd, FTAG); + +out_free: + kmem_free(dummy_ds, sizeof (dsl_dataset_t)); out: dsl_dataset_disown(ds, tag); return (err); @@ -1356,6 +1371,11 @@ dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, return (0); } +/* + * If you add new checks here, you may need to add + * additional checks to the "temporary" case in + * snapshot_check() in dmu_objset.c. + */ /* ARGSUSED */ int dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) @@ -1467,8 +1487,8 @@ static void remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t count; int err; + ASSERTV(uint64_t count); ASSERT(ds->ds_phys->ds_num_children >= 2); err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); @@ -1597,21 +1617,23 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; dsl_dataset_t *ds_prev = NULL; + boolean_t wont_destroy; uint64_t obj; - ASSERT(ds->ds_owner); + wont_destroy = (dsda->defer && + (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)); + + ASSERT(ds->ds_owner || wont_destroy); ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); ASSERT(ds->ds_prev == NULL || ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); - if (dsda->defer) { + if (wont_destroy) { ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); - if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; - return; - } + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; + return; } /* signal any waiters that this dataset is going away */ @@ -1620,11 +1642,6 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) cv_broadcast(&ds->ds_exclusive_cv); mutex_exit(&ds->ds_lock); - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - /* Remove our reservation */ if (ds->ds_reserved != 0) { dsl_prop_setarg_t psa; @@ -1751,6 +1768,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) if (dsl_dataset_is_snapshot(ds_next)) { dsl_dataset_t *ds_nextnext; + dsl_dataset_t *hds; /* * Update next's unique to include blocks which @@ -1773,7 +1791,6 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) ASSERT3P(ds_next->ds_prev, ==, NULL); /* Collapse range in this head. */ - dsl_dataset_t *hds; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds)); @@ -1850,6 +1867,15 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) } } + /* + * This must be done after the dsl_traverse(), because it will + * re-open the objset. + */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { /* Erase the link in the dir */ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); @@ -1887,7 +1913,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) "dataset = %llu", ds->ds_object); if (ds->ds_phys->ds_next_clones_obj != 0) { - uint64_t count; + ASSERTV(uint64_t count); ASSERT(0 == zap_count(mos, ds->ds_phys->ds_next_clones_obj, &count) && count == 0); VERIFY(0 == dmu_object_free(mos, @@ -1928,7 +1954,7 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) */ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); - if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE)) + if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) return (ENOSPC); /* @@ -2112,10 +2138,59 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) dmu_objset_sync(ds->ds_objset, zio, tx); } +static void +get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) +{ + uint64_t count = 0; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + nvlist_t *propval; + nvlist_t *val; + + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + /* + * There may me missing entries in ds_next_clones_obj + * due to a bug in a previous version of the code. + * Only trust it if it has the right number of entries. + */ + if (ds->ds_phys->ds_next_clones_obj != 0) { + ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, + &count)); + } + if (count != ds->ds_phys->ds_num_children - 1) { + goto fail; + } + for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *clone; + char buf[ZFS_MAXNAMELEN]; + if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + za.za_first_integer, FTAG, &clone) != 0) { + goto fail; + } + dsl_dir_name(clone->ds_dir, buf); + VERIFY(nvlist_add_boolean(val, buf) == 0); + dsl_dataset_rele(clone, FTAG); + } + zap_cursor_fini(&zc); + VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); + VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), + propval) == 0); +fail: + nvlist_free(val); + nvlist_free(propval); + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); +} + void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { - uint64_t refd, avail, uobjs, aobjs; + uint64_t refd, avail, uobjs, aobjs, ratio; dsl_dir_stats(ds->ds_dir, nv); @@ -2142,6 +2217,32 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, DS_IS_DEFER_DESTROY(ds) ? 1 : 0); + if (ds->ds_phys->ds_prev_snap_obj != 0) { + uint64_t written, comp, uncomp; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_dataset_t *prev; + int err; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + rw_exit(&dp->dp_config_rwlock); + if (err == 0) { + err = dsl_dataset_space_written(prev, ds, &written, + &comp, &uncomp); + dsl_dataset_rele(prev, FTAG); + if (err == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, + written); + } + } + } + + ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : + (ds->ds_phys->ds_uncompressed_bytes * 100 / + ds->ds_phys->ds_compressed_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); + if (ds->ds_phys->ds_next_snap_obj) { /* * This is a snapshot; override the dd's space used with @@ -2149,10 +2250,9 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) */ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, ds->ds_phys->ds_unique_bytes); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, - ds->ds_phys->ds_compressed_bytes == 0 ? 100 : - (ds->ds_phys->ds_uncompressed_bytes * 100 / - ds->ds_phys->ds_compressed_bytes)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); + + get_clones_stat(ds, nv); } } @@ -2217,15 +2317,28 @@ dsl_dataset_space(dsl_dataset_t *ds, boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; + ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool); ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || dsl_pool_sync_context(dp)); if (ds->ds_prev == NULL) return (B_FALSE); if (ds->ds_phys->ds_bp.blk_birth > - ds->ds_prev->ds_phys->ds_creation_txg) - return (B_TRUE); + ds->ds_prev->ds_phys->ds_creation_txg) { + objset_t *os, *os_prev; + /* + * It may be that only the ZIL differs, because it was + * reset in the head. Don't count that as being + * modified. + */ + if (dmu_objset_from_ds(ds, &os) != 0) + return (B_TRUE); + if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) + return (B_TRUE); + return (bcmp(&os->os_phys->os_meta_dnode, + &os_prev->os_phys->os_meta_dnode, + sizeof (os->os_phys->os_meta_dnode)) != 0); + } return (B_FALSE); } @@ -2475,7 +2588,6 @@ struct promotearg { }; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); -static boolean_t snaplist_unstable(list_t *l); static int dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) @@ -2876,10 +2988,11 @@ dsl_dataset_promote(const char *name, char *conflsnap) dsl_dir_t *dd; dsl_pool_t *dp; dmu_object_info_t doi; - struct promotearg pa = { 0 }; + struct promotearg pa; struct promotenode *snap; int err; + bzero(&pa, sizeof(struct promotearg)); err = dsl_dataset_hold(name, FTAG, &ds); if (err) return (err); @@ -3144,9 +3257,14 @@ dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, ASSERT(clone->ds_owner); ASSERT(origin_head->ds_owner); retry: - /* Need exclusive access for the swap */ - rw_enter(&clone->ds_rwlock, RW_WRITER); - if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { + /* + * Need exclusive access for the swap. If we're swapping these + * datasets back after an error, we already hold the locks. + */ + if (!RW_WRITE_HELD(&clone->ds_rwlock)) + rw_enter(&clone->ds_rwlock, RW_WRITER); + if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && + !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { rw_exit(&clone->ds_rwlock); rw_enter(&origin_head->ds_rwlock, RW_WRITER); if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { @@ -3230,6 +3348,8 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, error = ERESTART; else error = EDQUOT; + + DMU_TX_STAT_BUMP(dmu_tx_quota); } mutex_exit(&ds->ds_lock); @@ -3411,22 +3531,41 @@ dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, return (err); } -struct dsl_ds_holdarg { - dsl_sync_task_group_t *dstg; - char *htag; - char *snapname; - boolean_t recursive; - boolean_t gotone; - boolean_t temphold; - char failed[MAXPATHLEN]; -}; +typedef struct zfs_hold_cleanup_arg { + dsl_pool_t *dp; + uint64_t dsobj; + char htag[MAXNAMELEN]; +} zfs_hold_cleanup_arg_t; + +static void +dsl_dataset_user_release_onexit(void *arg) +{ + zfs_hold_cleanup_arg_t *ca = arg; + + (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, + B_TRUE); + kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); +} + +void +dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, + minor_t minor) +{ + zfs_hold_cleanup_arg_t *ca; + + ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); + ca->dp = ds->ds_dir->dd_pool; + ca->dsobj = ds->ds_object; + (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); + VERIFY3U(0, ==, zfs_onexit_add_cb(minor, + dsl_dataset_user_release_onexit, ca, NULL)); +} /* - * The max length of a temporary tag prefix is the number of hex digits - * required to express UINT64_MAX plus one for the hyphen. + * If you add new checks here, you may need to add + * additional checks to the "temporary" case in + * snapshot_check() in dmu_objset.c. */ -#define MAX_TAG_PREFIX_LEN 17 - static int dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -3461,7 +3600,7 @@ dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) return (error); } -static void +void dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; @@ -3524,13 +3663,41 @@ dsl_dataset_user_hold_one(const char *dsname, void *arg) } int +dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, + boolean_t temphold) +{ + struct dsl_ds_holdarg *ha; + int error; + + ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + ha->htag = htag; + ha->temphold = temphold; + error = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, + ds, ha, 0); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + + return (error); +} + +int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, - boolean_t recursive, boolean_t temphold) + boolean_t recursive, boolean_t temphold, int cleanup_fd) { struct dsl_ds_holdarg *ha; dsl_sync_task_t *dst; spa_t *spa; int error; + minor_t minor = 0; + + if (cleanup_fd != -1) { + /* Currently we only support cleanup-on-exit of tempholds. */ + if (!temphold) + return (EINVAL); + error = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (error) + return (error); + } ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); @@ -3539,6 +3706,8 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, error = spa_open(dsname, &spa, FTAG); if (error) { kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + if (cleanup_fd != -1) + zfs_onexit_fd_rele(cleanup_fd); return (error); } @@ -3547,6 +3716,7 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, ha->snapname = snapname; ha->recursive = recursive; ha->temphold = temphold; + if (recursive) { error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, ha, DS_FIND_CHILDREN); @@ -3563,6 +3733,12 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, if (dst->dst_err) { dsl_dataset_name(ds, ha->failed); *strchr(ha->failed, '@') = '\0'; + } else if (error == 0 && minor != 0 && temphold) { + /* + * If this hold is to be released upon process exit, + * register that action now. + */ + dsl_register_onexit_hold_cleanup(ds, htag, minor); } dsl_dataset_rele(ds, ha->dstg); } @@ -3574,8 +3750,11 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); dsl_sync_task_group_destroy(ha->dstg); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); spa_close(spa, FTAG); + if (cleanup_fd != -1) + zfs_onexit_fd_rele(cleanup_fd); return (error); } @@ -3667,11 +3846,6 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) uint64_t refs; int error; - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - mutex_enter(&ds->ds_lock); ds->ds_userrefs--; refs = ds->ds_userrefs; @@ -3680,6 +3854,11 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) VERIFY(error == 0 || error == ENOENT); zapobj = ds->ds_phys->ds_userrefs_obj; VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); + + spa_history_log_internal(LOG_DS_USER_RELEASE, + dp->dp_spa, tx, "<%s> %lld dataset = %llu", + ra->htag, (longlong_t)refs, dsobj); + if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && DS_IS_DEFER_DESTROY(ds)) { struct dsl_ds_destroyarg dsda = {0}; @@ -3690,10 +3869,6 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) /* We already did the destroy_check */ dsl_dataset_destroy_sync(&dsda, tag, tx); } - - spa_history_log_internal(LOG_DS_USER_RELEASE, - dp->dp_spa, tx, "<%s> %lld dataset = %llu", - ra->htag, (longlong_t)refs, dsobj); } static int @@ -3831,10 +4006,12 @@ top: } /* - * Called at spa_load time to release a stale temporary user hold. + * Called at spa_load time (with retry == B_FALSE) to release a stale + * temporary user hold. Also called by the onexit code (with retry == B_TRUE). */ int -dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag) +dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag, + boolean_t retry) { dsl_dataset_t *ds; char *snap; @@ -3842,20 +4019,36 @@ dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag) int namelen; int error; - rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - if (error) - return (error); - namelen = dsl_dataset_namelen(ds)+1; - name = kmem_alloc(namelen, KM_SLEEP); - dsl_dataset_name(ds, name); - dsl_dataset_rele(ds, FTAG); + do { + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + if (error) + return (error); + namelen = dsl_dataset_namelen(ds)+1; + name = kmem_alloc(namelen, KM_SLEEP); + dsl_dataset_name(ds, name); + dsl_dataset_rele(ds, FTAG); + + snap = strchr(name, '@'); + *snap = '\0'; + ++snap; + error = dsl_dataset_user_release(name, snap, htag, B_FALSE); + kmem_free(name, namelen); + + /* + * The object can't have been destroyed because we have a hold, + * but it might have been renamed, resulting in ENOENT. Retry + * if we've been requested to do so. + * + * It would be nice if we could use the dsobj all the way + * through and avoid ENOENT entirely. But we might need to + * unmount the snapshot, and there's currently no way to lookup + * a vfsp using a ZFS object id. + */ + } while ((error == ENOENT) && retry); - snap = strchr(name, '@'); - *snap = '\0'; - ++snap; - return (dsl_dataset_user_release(name, snap, htag, B_FALSE)); + return (error); } int @@ -3889,7 +4082,7 @@ dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) } /* - * Note, this fuction is used as the callback for dmu_objset_find(). We + * Note, this function is used as the callback for dmu_objset_find(). We * always return 0 so that we will continue to find and process * inconsistent datasets, even if we encounter an error trying to * process one of them. @@ -3908,3 +4101,202 @@ dsl_destroy_inconsistent(const char *dsname, void *arg) } return (0); } + + +/* + * Return (in *usedp) the amount of space written in new that is not + * present in oldsnap. New may be a snapshot or the head. Old must be + * a snapshot before new, in new's filesystem (or its origin). If not then + * fail and return EINVAL. + * + * The written space is calculated by considering two components: First, we + * ignore any freed space, and calculate the written as new's used space + * minus old's used space. Next, we add in the amount of space that was freed + * between the two snapshots, thus reducing new's used space relative to old's. + * Specifically, this is the space that was born before old->ds_creation_txg, + * and freed before new (ie. on new's deadlist or a previous deadlist). + * + * space freed [---------------------] + * snapshots ---O-------O--------O-------O------ + * oldsnap new + */ +int +dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err = 0; + uint64_t snapobj; + dsl_pool_t *dp = new->ds_dir->dd_pool; + + *usedp = 0; + *usedp += new->ds_phys->ds_used_bytes; + *usedp -= oldsnap->ds_phys->ds_used_bytes; + + *compp = 0; + *compp += new->ds_phys->ds_compressed_bytes; + *compp -= oldsnap->ds_phys->ds_compressed_bytes; + + *uncompp = 0; + *uncompp += new->ds_phys->ds_uncompressed_bytes; + *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + snapobj = new->ds_object; + while (snapobj != oldsnap->ds_object) { + dsl_dataset_t *snap; + uint64_t used, comp, uncomp; + + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); + if (err != 0) + break; + + if (snap->ds_phys->ds_prev_snap_txg == + oldsnap->ds_phys->ds_creation_txg) { + /* + * The blocks in the deadlist can not be born after + * ds_prev_snap_txg, so get the whole deadlist space, + * which is more efficient (especially for old-format + * deadlists). Unfortunately the deadlist code + * doesn't have enough information to make this + * optimization itself. + */ + dsl_deadlist_space(&snap->ds_deadlist, + &used, &comp, &uncomp); + } else { + dsl_deadlist_space_range(&snap->ds_deadlist, + 0, oldsnap->ds_phys->ds_creation_txg, + &used, &comp, &uncomp); + } + *usedp += used; + *compp += comp; + *uncompp += uncomp; + + /* + * If we get to the beginning of the chain of snapshots + * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap + * was not a snapshot of/before new. + */ + snapobj = snap->ds_phys->ds_prev_snap_obj; + dsl_dataset_rele(snap, FTAG); + if (snapobj == 0) { + err = EINVAL; + break; + } + + } + rw_exit(&dp->dp_config_rwlock); + return (err); +} + +/* + * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, + * lastsnap, and all snapshots in between are deleted. + * + * blocks that would be freed [---------------------------] + * snapshots ---O-------O--------O-------O--------O + * firstsnap lastsnap + * + * This is the set of blocks that were born after the snap before firstsnap, + * (birth > firstsnap->prev_snap_txg) and died before the snap after the + * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist). + * We calculate this by iterating over the relevant deadlists (from the snap + * after lastsnap, backward to the snap after firstsnap), summing up the + * space on the deadlist that was born after the snap before firstsnap. + */ +int +dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, + dsl_dataset_t *lastsnap, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err = 0; + uint64_t snapobj; + dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; + + ASSERT(dsl_dataset_is_snapshot(firstsnap)); + ASSERT(dsl_dataset_is_snapshot(lastsnap)); + + /* + * Check that the snapshots are in the same dsl_dir, and firstsnap + * is before lastsnap. + */ + if (firstsnap->ds_dir != lastsnap->ds_dir || + firstsnap->ds_phys->ds_creation_txg > + lastsnap->ds_phys->ds_creation_txg) + return (EINVAL); + + *usedp = *compp = *uncompp = 0; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + snapobj = lastsnap->ds_phys->ds_next_snap_obj; + while (snapobj != firstsnap->ds_object) { + dsl_dataset_t *ds; + uint64_t used, comp, uncomp; + + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds); + if (err != 0) + break; + + dsl_deadlist_space_range(&ds->ds_deadlist, + firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &used, &comp, &uncomp); + *usedp += used; + *compp += comp; + *uncompp += uncomp; + + snapobj = ds->ds_phys->ds_prev_snap_obj; + ASSERT3U(snapobj, !=, 0); + dsl_dataset_rele(ds, FTAG); + } + rw_exit(&dp->dp_config_rwlock); + return (err); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +EXPORT_SYMBOL(dmu_snapshots_destroy_nvl); +EXPORT_SYMBOL(dsl_dataset_hold); +EXPORT_SYMBOL(dsl_dataset_hold_obj); +EXPORT_SYMBOL(dsl_dataset_own); +EXPORT_SYMBOL(dsl_dataset_own_obj); +EXPORT_SYMBOL(dsl_dataset_name); +EXPORT_SYMBOL(dsl_dataset_rele); +EXPORT_SYMBOL(dsl_dataset_disown); +EXPORT_SYMBOL(dsl_dataset_drop_ref); +EXPORT_SYMBOL(dsl_dataset_tryown); +EXPORT_SYMBOL(dsl_dataset_make_exclusive); +EXPORT_SYMBOL(dsl_dataset_create_sync); +EXPORT_SYMBOL(dsl_dataset_create_sync_dd); +EXPORT_SYMBOL(dsl_dataset_destroy); +EXPORT_SYMBOL(dsl_dataset_destroy_check); +EXPORT_SYMBOL(dsl_dataset_destroy_sync); +EXPORT_SYMBOL(dsl_dataset_snapshot_check); +EXPORT_SYMBOL(dsl_dataset_snapshot_sync); +EXPORT_SYMBOL(dsl_dataset_rename); +EXPORT_SYMBOL(dsl_dataset_promote); +EXPORT_SYMBOL(dsl_dataset_clone_swap); +EXPORT_SYMBOL(dsl_dataset_user_hold); +EXPORT_SYMBOL(dsl_dataset_user_release); +EXPORT_SYMBOL(dsl_dataset_user_release_tmp); +EXPORT_SYMBOL(dsl_dataset_get_holds); +EXPORT_SYMBOL(dsl_dataset_get_blkptr); +EXPORT_SYMBOL(dsl_dataset_set_blkptr); +EXPORT_SYMBOL(dsl_dataset_get_spa); +EXPORT_SYMBOL(dsl_dataset_modified_since_lastsnap); +EXPORT_SYMBOL(dsl_dataset_space_written); +EXPORT_SYMBOL(dsl_dataset_space_wouldfree); +EXPORT_SYMBOL(dsl_dataset_sync); +EXPORT_SYMBOL(dsl_dataset_block_born); +EXPORT_SYMBOL(dsl_dataset_block_kill); +EXPORT_SYMBOL(dsl_dataset_block_freeable); +EXPORT_SYMBOL(dsl_dataset_prev_snap_txg); +EXPORT_SYMBOL(dsl_dataset_dirty); +EXPORT_SYMBOL(dsl_dataset_stats); +EXPORT_SYMBOL(dsl_dataset_fast_stat); +EXPORT_SYMBOL(dsl_dataset_space); +EXPORT_SYMBOL(dsl_dataset_fsid_guid); +EXPORT_SYMBOL(dsl_dsobj_to_dsname); +EXPORT_SYMBOL(dsl_dataset_check_quota); +EXPORT_SYMBOL(dsl_dataset_set_quota); +EXPORT_SYMBOL(dsl_dataset_set_quota_sync); +EXPORT_SYMBOL(dsl_dataset_set_reservation); +EXPORT_SYMBOL(dsl_destroy_inconsistent); +#endif