From 29809a6cbae9869ca6ee026337981b2c9771650a Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Fri, 14 Dec 2012 16:13:40 -0800 Subject: [PATCH] Illumos #3086: unnecessarily setting DS_FLAG_INCONSISTENT on async 3086 unnecessarily setting DS_FLAG_INCONSISTENT on async destroyed datasets Reviewed by: Christopher Siden Approved by: Eric Schrock References: illumos/illumos-gate@ce636f8b38e8c9ff484e880d9abb27251a882860 illumos changeset: 13776:cd512c80fd75 https://www.illumos.org/issues/3086 Ported-by: Brian Behlendorf --- cmd/ztest/ztest.c | 3 ++ include/sys/dsl_pool.h | 7 ++- include/sys/txg.h | 5 +- include/sys/zil.h | 2 + include/sys/zil_impl.h | 2 + module/zfs/dmu.c | 4 +- module/zfs/dmu_send.c | 7 --- module/zfs/dsl_dataset.c | 108 +++++++++++++++++++------------------------ module/zfs/dsl_dir.c | 10 ++-- module/zfs/dsl_pool.c | 116 +++++++++++++++++++++++++++++++++++------------ module/zfs/txg.c | 4 +- module/zfs/zil.c | 56 +++++++++++++++++++++-- 12 files changed, 209 insertions(+), 115 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 9b7adc7..6df0812 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -2277,6 +2277,7 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; + mutex_enter(&zd->zd_dirobj_lock); (void) rw_enter(&zd->zd_zilog_lock, RW_WRITER); /* zfs_sb_teardown() */ @@ -2287,6 +2288,7 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id) zil_replay(os, zd, ztest_replay_vector); (void) rw_exit(&zd->zd_zilog_lock); + mutex_exit(&zd->zd_dirobj_lock); } /* @@ -5743,6 +5745,7 @@ ztest_freeze(void) */ kernel_init(FREAD | FWRITE); VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + ASSERT(spa_freeze_txg(spa) == UINT64_MAX); VERIFY3U(0, ==, ztest_dataset_open(0)); ztest_dataset_close(0); diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 16fb986..61c91f0 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -90,7 +90,6 @@ typedef struct dsl_pool { /* No lock needed - sync 
context only */ blkptr_t dp_meta_rootbp; - list_t dp_synced_datasets; hrtime_t dp_read_overhead; uint64_t dp_throughput; /* bytes per millisec */ uint64_t dp_write_limit; @@ -104,6 +103,9 @@ typedef struct dsl_pool { kmutex_t dp_lock; uint64_t dp_space_towrite[TXG_SIZE]; uint64_t dp_tempreserved[TXG_SIZE]; + uint64_t dp_mos_used_delta; + uint64_t dp_mos_compressed_delta; + uint64_t dp_mos_uncompressed_delta; uint64_t dp_txg_history_size; list_t dp_txg_history; @@ -111,6 +113,7 @@ typedef struct dsl_pool { /* Has its own locking */ tx_state_t dp_tx; txg_list_t dp_dirty_datasets; + txg_list_t dp_dirty_zilogs; txg_list_t dp_dirty_dirs; txg_list_t dp_sync_tasks; @@ -150,6 +153,8 @@ int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_mos_diduse_space(dsl_pool_t *dp, + int64_t used, int64_t comp, int64_t uncomp); taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp); diff --git a/include/sys/txg.h b/include/sys/txg.h index 2f87d74..f9d6dd4 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -22,6 +22,9 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ */ #ifndef _SYS_TXG_H #define _SYS_TXG_H @@ -121,7 +124,7 @@ extern void txg_wait_callbacks(struct dsl_pool *dp); extern void txg_list_create(txg_list_t *tl, size_t offset); extern void txg_list_destroy(txg_list_t *tl); -extern int txg_list_empty(txg_list_t *tl, uint64_t txg); +extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg); extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg); extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); diff --git a/include/sys/zil.h b/include/sys/zil.h index c583887..f786f0c 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -454,6 +455,7 @@ extern void zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]); extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx); extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); +extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index 6c37c1a..f5b69b7 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ /* Portions Copyright 2010 Robert Milkowski */ @@ -131,6 +132,7 @@ struct zilog { zil_header_t zl_old_header; /* debugging aid */ uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ uint_t zl_prev_rotor; /* rotor for zl_prev[] */ + txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ }; typedef struct zil_bp_node { diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 5d3f70d..e856356 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1953,15 +1953,15 @@ dmu_init(void) dbuf_init(); zfetch_init(); dmu_tx_init(); - arc_init(); l2arc_init(); + arc_init(); } void dmu_fini(void) { - l2arc_fini(); arc_fini(); + l2arc_fini(); dmu_tx_fini(); zfetch_fini(); dbuf_fini(); diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 97c23cb..0cf3c4a 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1609,13 +1609,6 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc) dsl_dataset_t *ds = drc->drc_logical_ds; int err, myerr; - /* - * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() - * expects it to have a ds_user_ptr (and zil), but clone_swap() - * can close it. - */ - txg_wait_synced(ds->ds_dir->dd_pool, 0); - if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, drc->drc_force); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 872d44a..c5b84a2 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -105,14 +105,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); if (ds == NULL) { - /* - * Account for the meta-objset space in its placeholder - * dsl_dir. 
- */ - ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, - used, compressed, uncompressed, tx); - dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); + dsl_pool_mos_diduse_space(tx->tx_pool, + used, compressed, uncompressed); return; } dmu_buf_will_dirty(ds->ds_dbuf, tx); @@ -150,15 +144,9 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ASSERT(used > 0); if (ds == NULL) { - /* - * Account for the meta-objset space in its placeholder - * dataset. - */ dsl_free(tx->tx_pool, tx->tx_txg, bp); - - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, - -used, -compressed, -uncompressed, tx); - dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); + dsl_pool_mos_diduse_space(tx->tx_pool, + -used, -compressed, -uncompressed); return (used); } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); @@ -1074,26 +1062,26 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) dummy_ds->ds_dir = dd; dummy_ds->ds_object = ds->ds_object; - /* - * Check for errors and mark this ds as inconsistent, in - * case we crash while freeing the objects. - */ - err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, - dsl_dataset_destroy_begin_sync, ds, NULL, 0); - if (err) - goto out_free; - - err = dmu_objset_from_ds(ds, &os); - if (err) - goto out_free; - - /* - * If async destruction is not enabled try to remove all objects - * while in the open context so that there is less work to do in - * the syncing context. - */ if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds), &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + /* + * Check for errors and mark this ds as inconsistent, in + * case we crash while freeing the objects. 
+ */ + err = dsl_sync_task_do(dd->dd_pool, + dsl_dataset_destroy_begin_check, + dsl_dataset_destroy_begin_sync, ds, NULL, 0); + if (err) + goto out_free; + + err = dmu_objset_from_ds(ds, &os); + if (err) + goto out_free; + + /* + * Remove all objects while in the open context so that + * there is less work to do in the syncing context. + */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, ds->ds_phys->ds_prev_snap_txg)) { /* @@ -1104,29 +1092,25 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) } if (err != ESRCH) goto out_free; - } - /* - * Only the ZIL knows how to free log blocks. - */ - zil_destroy(dmu_objset_zil(os), B_FALSE); - - /* - * Sync out all in-flight IO. - */ - txg_wait_synced(dd->dd_pool, 0); + /* + * Sync out all in-flight IO. + */ + txg_wait_synced(dd->dd_pool, 0); - /* - * If we managed to free all the objects in open - * context, the user space accounting should be zero. - */ - if (ds->ds_phys->ds_bp.blk_fill == 0 && - dmu_objset_userused_enabled(os)) { - ASSERTV(uint64_t count); - ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || - count == 0); - ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || - count == 0); + /* + * If we managed to free all the objects in open + * context, the user space accounting should be zero. + */ + if (ds->ds_phys->ds_bp.blk_fill == 0 && + dmu_objset_userused_enabled(os)) { + ASSERTV(uint64_t count); + + ASSERT(zap_count(os, DMU_USERUSED_OBJECT, + &count) != 0 || count == 0); + ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, + &count) != 0 || count == 0); + } } rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); @@ -1878,6 +1862,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) } else { zfeature_info_t *async_destroy = &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; + objset_t *os; /* * There's no next snapshot, so this is a head dataset. 
@@ -1889,6 +1874,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); ds->ds_phys->ds_deadlist_obj = 0; + VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); + if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { err = old_synchronous_dataset_destroy(ds, tx); } else { @@ -1898,12 +1885,12 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) */ uint64_t used, comp, uncomp; - ASSERT(err == 0 || err == EBUSY); + zil_destroy_sync(dmu_objset_zil(os), tx); + if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { spa_feature_incr(dp->dp_spa, async_destroy, tx); - dp->dp_bptree_obj = bptree_alloc( - dp->dp_meta_objset, tx); - VERIFY(zap_add(dp->dp_meta_objset, + dp->dp_bptree_obj = bptree_alloc(mos, tx); + VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, &dp->dp_bptree_obj, tx) == 0); @@ -1916,7 +1903,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == used); - bptree_add(dp->dp_meta_objset, dp->dp_bptree_obj, + bptree_add(mos, dp->dp_bptree_obj, &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, used, comp, uncomp, tx); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, @@ -2203,7 +2190,6 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; - dsl_dir_dirty(ds->ds_dir, tx); dmu_objset_sync(ds->ds_objset, zio, tx); } diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 377df40..7412239 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -189,7 +189,6 @@ errout: kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); return (err); - } void @@ -223,7 +222,7 @@ dsl_dir_name(dsl_dir_t *dd, char *buf) } } -/* Calculate name legnth, avoiding all the strcat calls of dsl_dir_name */ +/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ 
int dsl_dir_namelen(dsl_dir_t *dd) { @@ -592,8 +591,6 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); - dmu_buf_will_dirty(dd->dd_dbuf, tx); - mutex_enter(&dd->dd_lock); ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0); dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, @@ -950,8 +947,6 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, ASSERT(dmu_tx_is_syncing(tx)); ASSERT(type < DD_USED_NUM); - dsl_dir_dirty(dd, tx); - if (needlock) mutex_enter(&dd->dd_lock); accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used); @@ -960,6 +955,7 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, dd->dd_phys->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); + dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_used_bytes += used; dd->dd_phys->dd_uncompressed_bytes += uncompressed; dd->dd_phys->dd_compressed_bytes += compressed; @@ -1003,13 +999,13 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; - dsl_dir_dirty(dd, tx); if (needlock) mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? 
dd->dd_phys->dd_used_breakdown[oldtype] >= delta : dd->dd_phys->dd_used_breakdown[newtype] >= -delta); ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta)); + dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_used_breakdown[oldtype] -= delta; dd->dd_phys->dd_used_breakdown[newtype] += delta; if (needlock) diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 85c745e..089a7f0 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -42,6 +42,7 @@ #include #include #include +#include int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ @@ -224,12 +225,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) txg_list_create(&dp->dp_dirty_datasets, offsetof(dsl_dataset_t, ds_dirty_link)); + txg_list_create(&dp->dp_dirty_zilogs, + offsetof(zilog_t, zl_dirty_link)); txg_list_create(&dp->dp_dirty_dirs, offsetof(dsl_dir_t, dd_dirty_link)); txg_list_create(&dp->dp_sync_tasks, offsetof(dsl_sync_task_group_t, dstg_node)); - list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t), - offsetof(dsl_dataset_t, ds_synced_link)); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); @@ -362,9 +363,9 @@ dsl_pool_close(dsl_pool_t *dp) dmu_objset_evict(dp->dp_meta_objset); txg_list_destroy(&dp->dp_dirty_datasets); + txg_list_destroy(&dp->dp_dirty_zilogs); txg_list_destroy(&dp->dp_sync_tasks); txg_list_destroy(&dp->dp_dirty_dirs); - list_destroy(&dp->dp_synced_datasets); arc_flush(dp->dp_spa); txg_fini(dp); @@ -445,6 +446,21 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) return (dp); } +/* + * Account for the meta-objset space in its placeholder dsl_dir. 
+ */ +void +dsl_pool_mos_diduse_space(dsl_pool_t *dp, + int64_t used, int64_t comp, int64_t uncomp) +{ + ASSERT3U(comp, ==, uncomp); /* it's all metadata */ + mutex_enter(&dp->dp_lock); + dp->dp_mos_used_delta += used; + dp->dp_mos_compressed_delta += comp; + dp->dp_mos_uncompressed_delta += uncomp; + mutex_exit(&dp->dp_lock); +} + static int deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { @@ -463,11 +479,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; - dsl_sync_task_group_t *dstg; objset_t *mos = dp->dp_meta_objset; hrtime_t start, write_time; uint64_t data_written; int err; + list_t synced_datasets; + + list_create(&synced_datasets, sizeof (dsl_dataset_t), + offsetof(dsl_dataset_t, ds_synced_link)); /* * We need to copy dp_space_towrite() before doing @@ -490,7 +509,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * may sync newly-created datasets on pass 2. */ ASSERT(!list_link_active(&ds->ds_synced_link)); - list_insert_tail(&dp->dp_synced_datasets, ds); + list_insert_tail(&synced_datasets, ds); dsl_dataset_sync(ds, zio, tx); } DTRACE_PROBE(pool_sync__1setup); @@ -500,15 +519,20 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) ASSERT(err == 0); DTRACE_PROBE(pool_sync__2rootzio); - for (ds = list_head(&dp->dp_synced_datasets); ds; - ds = list_next(&dp->dp_synced_datasets, ds)) + /* + * After the data blocks have been written (ensured by the zio_wait() + * above), update the user/group space accounting. + */ + for (ds = list_head(&synced_datasets); ds; + ds = list_next(&synced_datasets, ds)) dmu_objset_do_userquota_updates(ds->ds_objset, tx); /* * Sync the datasets again to push out the changes due to * userspace updates. This must be done before we process the - * sync tasks, because that could cause a snapshot of a dataset - * whose ds_bp will be rewritten when we do this 2nd sync. 
+ * sync tasks, so that any snapshots will have the correct + * user accounting information (and we won't get confused + * about which blocks are part of the snapshot). */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) { @@ -519,30 +543,42 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) err = zio_wait(zio); /* - * Move dead blocks from the pending deadlist to the on-disk - * deadlist. + * Now that the datasets have been completely synced, we can + * clean up our in-memory structures accumulated while syncing: + * + * - move dead blocks from the pending deadlist to the on-disk deadlist + * - clean up zil records + * - release hold from dsl_dataset_dirty() */ - for (ds = list_head(&dp->dp_synced_datasets); ds; - ds = list_next(&dp->dp_synced_datasets, ds)) { + while ((ds = list_remove_head(&synced_datasets))) { + ASSERTV(objset_t *os = ds->ds_objset); bplist_iterate(&ds->ds_pending_deadlist, deadlist_enqueue_cb, &ds->ds_deadlist, tx); + ASSERT(!dmu_objset_is_dirty(os, txg)); + dmu_buf_rele(ds->ds_dbuf, ds); } - while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg))) { - /* - * No more sync tasks should have been added while we - * were syncing. - */ - ASSERT(spa_sync_pass(dp->dp_spa) == 1); - dsl_sync_task_group_sync(dstg, tx); - } - DTRACE_PROBE(pool_sync__3task); - start = gethrtime(); while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg))) dsl_dir_sync(dd, tx); write_time += gethrtime() - start; + /* + * The MOS's space is accounted for in the pool/$MOS + * (dp_mos_dir). We can't modify the mos while we're syncing + * it, so we remember the deltas and apply them here. 
+ */ + if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || + dp->dp_mos_uncompressed_delta != 0) { + dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, + dp->dp_mos_used_delta, + dp->dp_mos_compressed_delta, + dp->dp_mos_uncompressed_delta, tx); + dp->dp_mos_used_delta = 0; + dp->dp_mos_compressed_delta = 0; + dp->dp_mos_uncompressed_delta = 0; + } + start = gethrtime(); if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { @@ -558,6 +594,27 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) hrtime_t, dp->dp_read_overhead); write_time -= dp->dp_read_overhead; + /* + * If we modify a dataset in the same txg that we want to destroy it, + * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. + * dsl_dir_destroy_check() will fail if there are unexpected holds. + * Therefore, we want to sync the MOS (thus syncing the dd_dbuf + * and clearing the hold on it) before we process the sync_tasks. + * The MOS data dirtied by the sync_tasks will be synced on the next + * pass. + */ + DTRACE_PROBE(pool_sync__3task); + if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { + dsl_sync_task_group_t *dstg; + /* + * No more sync tasks should have been added while we + * were syncing. 
+ */ + ASSERT(spa_sync_pass(dp->dp_spa) == 1); + while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg))) + dsl_sync_task_group_sync(dstg, tx); + } + dmu_tx_commit(tx); dp->dp_space_towrite[txg & TXG_MASK] = 0; @@ -606,15 +663,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) { + zilog_t *zilog; dsl_dataset_t *ds; - objset_t *os; - while ((ds = list_head(&dp->dp_synced_datasets))) { - list_remove(&dp->dp_synced_datasets, ds); - os = ds->ds_objset; - zil_clean(os->os_zil, txg); - ASSERT(!dmu_objset_is_dirty(os, txg)); - dmu_buf_rele(ds->ds_dbuf, ds); + while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg))) { + ds = dmu_objset_ds(zilog->zl_os); + zil_clean(zilog, txg); + ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); + dmu_buf_rele(ds->ds_dbuf, zilog); } ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 17494bc..838a6f6 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2011 Martin Matuska + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -669,7 +671,7 @@ txg_list_destroy(txg_list_t *tl) mutex_destroy(&tl->tl_lock); } -int +boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg) { return (tl->tl_head[txg & TXG_MASK] == NULL); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 220f2d7..c9618c1 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ /* Portions Copyright 2010 Robert Milkowski */ @@ -480,6 +480,38 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite) } /* + * Called when we create in-memory log transactions so that we know + * to cleanup the itxs at the end of spa_sync(). + */ +void +zilog_dirty(zilog_t *zilog, uint64_t txg) +{ + dsl_pool_t *dp = zilog->zl_dmu_pool; + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + + if (dsl_dataset_is_snapshot(ds)) + panic("dirtying snapshot!"); + + if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg) == 0) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(ds->ds_dbuf, zilog); + } +} + +boolean_t +zilog_is_dirty(zilog_t *zilog) +{ + dsl_pool_t *dp = zilog->zl_dmu_pool; + int t; + + for (t = 0; t < TXG_SIZE; t++) { + if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) + return (B_TRUE); + } + return (B_FALSE); +} + +/* * Create an on-disk intent log. */ static lwb_t * @@ -601,14 +633,21 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) kmem_cache_free(zil_lwb_cache, lwb); } } else if (!keep_first) { - (void) zil_parse(zilog, zil_free_log_block, - zil_free_log_record, tx, zh->zh_claim_txg); + zil_destroy_sync(zilog, tx); } mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); } +void +zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) +{ + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + (void) zil_parse(zilog, zil_free_log_block, + zil_free_log_record, tx, zilog->zl_header->zh_claim_txg); +} + int zil_claim(const char *osname, void *txarg) { @@ -1042,6 +1081,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) return (NULL); ASSERT(lwb->lwb_buf != NULL); + ASSERT(zilog_is_dirty(zilog) || + spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) dlen = P2ROUNDUP_TYPED( @@ -1272,7 +1313,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) zil_async_to_sync(zilog, itx->itx_oid); - if 
(spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) + if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) txg = ZILTEST_TXG; else txg = dmu_tx_get_txg(tx); @@ -1323,6 +1364,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) } itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); + zilog_dirty(zilog, txg); mutex_exit(&itxg->itxg_lock); /* Release the old itxs now we've dropped the lock */ @@ -1332,7 +1374,10 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) /* * If there are any in-memory intent log transactions which have now been - * synced then start up a taskq to free them. + * synced then start up a taskq to free them. We should only do this after we + * have written out the uberblocks (i.e. txg has been committed) so that we + * don't inadvertently clean out in-memory log records that would be required + * by zil_commit(). */ void zil_clean(zilog_t *zilog, uint64_t synced_txg) @@ -1837,6 +1882,7 @@ zil_close(zilog_t *zilog) mutex_exit(&zilog->zl_lock); if (txg) txg_wait_synced(zilog->zl_dmu_pool, txg); + ASSERT(!zilog_is_dirty(zilog)); taskq_destroy(zilog->zl_clean_taskq); zilog->zl_clean_taskq = NULL; -- 1.8.3.1