diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 7e0fba5..7795d80 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_pool.h>
@@ -40,6 +40,9 @@
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_deadlist.h>
+#include <sys/bptree.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
 
 int zfs_no_write_throttle = 0;
 int zfs_write_limit_shift = 3;	/* 1/8th of physical memory */
@@ -55,6 +58,63 @@ kmutex_t zfs_write_limit_lock;
 
 static pgcnt_t old_physmem = 0;
 
+static void
+dsl_pool_tx_assign_init(dsl_pool_t *dp, unsigned int ndata)
+{
+	kstat_named_t *ks;
+	char name[KSTAT_STRLEN];
+	int i, data_size = ndata * sizeof(kstat_named_t);
+
+	(void) snprintf(name, KSTAT_STRLEN, "dmu_tx_assign-%s",
+	    spa_name(dp->dp_spa));
+
+	dp->dp_tx_assign_size = ndata;
+
+	if (data_size)
+		dp->dp_tx_assign_buckets = kmem_alloc(data_size, KM_SLEEP);
+	else
+		dp->dp_tx_assign_buckets = NULL;
+
+	for (i = 0; i < dp->dp_tx_assign_size; i++) {
+		ks = &dp->dp_tx_assign_buckets[i];
+		ks->data_type = KSTAT_DATA_UINT64;
+		ks->value.ui64 = 0;
+		(void) snprintf(ks->name, KSTAT_STRLEN, "%u us", 1 << i);
+	}
+
+	dp->dp_tx_assign_kstat = kstat_create("zfs", 0, name, "misc",
+	    KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
+
+	if (dp->dp_tx_assign_kstat) {
+		dp->dp_tx_assign_kstat->ks_data = dp->dp_tx_assign_buckets;
+		dp->dp_tx_assign_kstat->ks_ndata = dp->dp_tx_assign_size;
+		dp->dp_tx_assign_kstat->ks_data_size = data_size;
+		kstat_install(dp->dp_tx_assign_kstat);
+	}
+}
+
+static void
+dsl_pool_tx_assign_destroy(dsl_pool_t *dp)
+{
+	if (dp->dp_tx_assign_buckets)
+		kmem_free(dp->dp_tx_assign_buckets,
+		    dp->dp_tx_assign_size * sizeof(kstat_named_t));
+
+	if (dp->dp_tx_assign_kstat)
+		kstat_delete(dp->dp_tx_assign_kstat);
+}
+
+void
+dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs)
+{
+	uint64_t idx = 0;
+
+	while (((1 << idx) < usecs) && (idx < dp->dp_tx_assign_size - 1))
+		idx++;
+
+	atomic_inc_64(&dp->dp_tx_assign_buckets[idx].value.ui64);
+}
+
 static int
 dsl_pool_txg_history_update(kstat_t *ksp, int rw)
 {
@@ -140,7 +200,7 @@ dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg)
 {
 	txg_history_t *th, *rm;
 
-	th = kmem_zalloc(sizeof (txg_history_t), KM_SLEEP);
+	th = kmem_zalloc(sizeof (txg_history_t), KM_PUSHPAGE);
 	mutex_init(&th->th_lock, NULL, MUTEX_DEFAULT, NULL);
 	th->th_kstat.txg = txg;
 	th->th_kstat.state = TXG_STATE_OPEN;
@@ -222,12 +282,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 
 	txg_list_create(&dp->dp_dirty_datasets,
 	    offsetof(dsl_dataset_t, ds_dirty_link));
+	txg_list_create(&dp->dp_dirty_zilogs,
+	    offsetof(zilog_t, zl_dirty_link));
 	txg_list_create(&dp->dp_dirty_dirs,
 	    offsetof(dsl_dir_t, dd_dirty_link));
 	txg_list_create(&dp->dp_sync_tasks,
 	    offsetof(dsl_sync_task_group_t, dstg_node));
-	list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
-	    offsetof(dsl_dataset_t, ds_synced_link));
 
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 
@@ -235,25 +295,36 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	    1, 4, 0);
 
 	dsl_pool_txg_history_init(dp, txg);
+	dsl_pool_tx_assign_init(dp, 32);
 
 	return (dp);
 }
 
 int
-dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
+dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 {
 	int err;
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+
+	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
+	    &dp->dp_meta_objset);
+	if (err != 0)
+		dsl_pool_close(dp);
+	else
+		*dpp = dp;
+
+	return (err);
+}
+
+int
+dsl_pool_open(dsl_pool_t *dp)
+{
+	int err;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	uint64_t obj;
 
 	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
-	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
-	    &dp->dp_meta_objset);
-	if (err)
-		goto out;
-
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 	    &dp->dp_root_dir_obj);
@@ -269,7 +340,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 	if (err)
 		goto out;
 
-	if (spa_version(spa) >= SPA_VERSION_ORIGIN) {
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 		if (err)
 			goto out;
@@ -286,7 +357,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 		goto out;
 	}
 
-	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 		    &dp->dp_free_dir);
 		if (err)
@@ -300,6 +371,24 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 		    dp->dp_meta_objset, obj));
 	}
 
+	if (spa_feature_is_active(dp->dp_spa,
+	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+		    &dp->dp_bptree_obj);
+		if (err != 0)
+			goto out;
+	}
+
+	if (spa_feature_is_active(dp->dp_spa,
+	    &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+		    &dp->dp_empty_bpobj);
+		if (err != 0)
+			goto out;
+	}
+
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 	    &dp->dp_tmp_userrefs_obj);
@@ -308,15 +397,10 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 	if (err)
 		goto out;
 
-	err = dsl_scan_init(dp, txg);
+	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 
 out:
 	rw_exit(&dp->dp_config_rwlock);
-	if (err)
-		dsl_pool_close(dp);
-	else
-		*dpp = dp;
-
 	return (err);
 }
 
@@ -346,13 +430,14 @@ dsl_pool_close(dsl_pool_t *dp)
 		dmu_objset_evict(dp->dp_meta_objset);
 
 	txg_list_destroy(&dp->dp_dirty_datasets);
+	txg_list_destroy(&dp->dp_dirty_zilogs);
 	txg_list_destroy(&dp->dp_sync_tasks);
 	txg_list_destroy(&dp->dp_dirty_dirs);
-	list_destroy(&dp->dp_synced_datasets);
 
 	arc_flush(dp->dp_spa);
 	txg_fini(dp);
 	dsl_scan_fini(dp);
+	dsl_pool_tx_assign_destroy(dp);
 	dsl_pool_txg_history_destroy(dp);
 	rw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
@@ -379,7 +464,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 	/* create the pool directory */
 	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
-	ASSERT3U(err, ==, 0);
+	ASSERT0(err);
 
 	/* Initialize scan structures */
 	VERIFY3U(0, ==, dsl_scan_init(dp, txg));
@@ -429,6 +514,21 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 	return (dp);
 }
 
+/*
+ * Account for the meta-objset space in its placeholder dsl_dir.
+ */
+void
+dsl_pool_mos_diduse_space(dsl_pool_t *dp,
+    int64_t used, int64_t comp, int64_t uncomp)
+{
+	ASSERT3U(comp, ==, uncomp);	/* it's all metadata */
+	mutex_enter(&dp->dp_lock);
+	dp->dp_mos_used_delta += used;
+	dp->dp_mos_compressed_delta += comp;
+	dp->dp_mos_uncompressed_delta += uncomp;
+	mutex_exit(&dp->dp_lock);
+}
+
 static int
 deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
@@ -447,11 +547,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	dmu_tx_t *tx;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
-	dsl_sync_task_group_t *dstg;
 	objset_t *mos = dp->dp_meta_objset;
 	hrtime_t start, write_time;
 	uint64_t data_written;
 	int err;
+	list_t synced_datasets;
+
+	list_create(&synced_datasets, sizeof (dsl_dataset_t),
+	    offsetof(dsl_dataset_t, ds_synced_link));
 
 	/*
 	 * We need to copy dp_space_towrite() before doing
@@ -474,7 +577,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 		 * may sync newly-created datasets on pass 2.
 		 */
 		ASSERT(!list_link_active(&ds->ds_synced_link));
-		list_insert_tail(&dp->dp_synced_datasets, ds);
+		list_insert_tail(&synced_datasets, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
 	DTRACE_PROBE(pool_sync__1setup);
@@ -484,15 +587,20 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	ASSERT(err == 0);
 	DTRACE_PROBE(pool_sync__2rootzio);
 
-	for (ds = list_head(&dp->dp_synced_datasets); ds;
-	    ds = list_next(&dp->dp_synced_datasets, ds))
+	/*
+	 * After the data blocks have been written (ensured by the zio_wait()
+	 * above), update the user/group space accounting.
+	 */
+	for (ds = list_head(&synced_datasets); ds;
+	    ds = list_next(&synced_datasets, ds))
 		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 
 	/*
 	 * Sync the datasets again to push out the changes due to
 	 * userspace updates.  This must be done before we process the
-	 * sync tasks, because that could cause a snapshot of a dataset
-	 * whose ds_bp will be rewritten when we do this 2nd sync.
+	 * sync tasks, so that any snapshots will have the correct
+	 * user accounting information (and we won't get confused
+	 * about which blocks are part of the snapshot).
 	 */
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) {
@@ -503,30 +611,42 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	err = zio_wait(zio);
 
 	/*
-	 * Move dead blocks from the pending deadlist to the on-disk
-	 * deadlist.
+	 * Now that the datasets have been completely synced, we can
+	 * clean up our in-memory structures accumulated while syncing:
+	 *
+	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
+	 *  - clean up zil records
+	 *  - release hold from dsl_dataset_dirty()
 	 */
-	for (ds = list_head(&dp->dp_synced_datasets); ds;
-	    ds = list_next(&dp->dp_synced_datasets, ds)) {
+	while ((ds = list_remove_head(&synced_datasets))) {
+		ASSERTV(objset_t *os = ds->ds_objset);
 		bplist_iterate(&ds->ds_pending_deadlist,
 		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+		ASSERT(!dmu_objset_is_dirty(os, txg));
+		dmu_buf_rele(ds->ds_dbuf, ds);
 	}
 
-	while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg))) {
-		/*
-		 * No more sync tasks should have been added while we
-		 * were syncing.
-		 */
-		ASSERT(spa_sync_pass(dp->dp_spa) == 1);
-		dsl_sync_task_group_sync(dstg, tx);
-	}
-	DTRACE_PROBE(pool_sync__3task);
-
 	start = gethrtime();
 	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)))
 		dsl_dir_sync(dd, tx);
 	write_time += gethrtime() - start;
 
+	/*
+	 * The MOS's space is accounted for in the pool/$MOS
+	 * (dp_mos_dir).  We can't modify the mos while we're syncing
+	 * it, so we remember the deltas and apply them here.
+	 */
+	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
+	    dp->dp_mos_uncompressed_delta != 0) {
+		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
+		    dp->dp_mos_used_delta,
+		    dp->dp_mos_compressed_delta,
+		    dp->dp_mos_uncompressed_delta, tx);
+		dp->dp_mos_used_delta = 0;
+		dp->dp_mos_compressed_delta = 0;
+		dp->dp_mos_uncompressed_delta = 0;
+	}
+
 	start = gethrtime();
 	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
@@ -542,6 +662,27 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	    hrtime_t, dp->dp_read_overhead);
 	write_time -= dp->dp_read_overhead;
 
+	/*
+	 * If we modify a dataset in the same txg that we want to destroy it,
+	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
+	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
+	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
+	 * and clearing the hold on it) before we process the sync_tasks.
+	 * The MOS data dirtied by the sync_tasks will be synced on the next
+	 * pass.
+	 */
+	DTRACE_PROBE(pool_sync__3task);
+	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
+		dsl_sync_task_group_t *dstg;
+		/*
+		 * No more sync tasks should have been added while we
+		 * were syncing.
+		 */
+		ASSERT(spa_sync_pass(dp->dp_spa) == 1);
+		while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg)))
+			dsl_sync_task_group_sync(dstg, tx);
+	}
+
 	dmu_tx_commit(tx);
 
 	dp->dp_space_towrite[txg & TXG_MASK] = 0;
@@ -590,15 +731,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 void
 dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 {
+	zilog_t *zilog;
 	dsl_dataset_t *ds;
-	objset_t *os;
 
-	while ((ds = list_head(&dp->dp_synced_datasets))) {
-		list_remove(&dp->dp_synced_datasets, ds);
-		os = ds->ds_objset;
-		zil_clean(os->os_zil, txg);
-		ASSERT(!dmu_objset_is_dirty(os, txg));
-		dmu_buf_rele(ds->ds_dbuf, ds);
+	while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg))) {
+		ds = dmu_objset_ds(zilog->zl_os);
+		zil_clean(zilog, txg);
+		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
+		dmu_buf_rele(ds->ds_dbuf, zilog);
 	}
 	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }
@@ -611,7 +751,7 @@ int
 dsl_pool_sync_context(dsl_pool_t *dp)
 {
 	return (curthread == dp->dp_tx.tx_sync_thread ||
-	    spa_get_dsl(dp->dp_spa) == NULL);
+	    spa_is_initializing(dp->dp_spa));
 }
 
 uint64_t
@@ -932,11 +1072,8 @@ dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 	ASSERT(dp->dp_tmp_userrefs_obj == 0);
 	ASSERT(dmu_tx_is_syncing(tx));
 
-	dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS,
-	    DMU_OT_NONE, 0, tx);
-
-	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS,
-	    sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0);
+	dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 }
 
 static int
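
Note: the "dmu_tx_assign-<pool>" kstat added by this patch is a power-of-two latency
histogram. The standalone user-space sketch below mirrors the bucket-selection loop
in dsl_pool_tx_assign_add_usecs(): the smallest index whose bound (1 << idx)
microseconds reaches the measured delay is chosen, clamped to the last bucket. The
bucket_index() helper, NBUCKETS constant, and sample values are hypothetical and not
part of the patch; the demo uses a 64-bit shift, which the in-kernel loop's int
shift does not.

/*
 * bucket_demo.c -- illustrative sketch only; not part of the patch above.
 */
#include <stdio.h>
#include <stdint.h>

#define	NBUCKETS	32	/* matches dsl_pool_tx_assign_init(dp, 32) */

/*
 * Same selection rule as dsl_pool_tx_assign_add_usecs(): smallest idx
 * with (1 << idx) us >= the delay, clamped to the last bucket.
 */
static uint64_t
bucket_index(uint64_t usecs)
{
	uint64_t idx = 0;

	while (((1ULL << idx) < usecs) && (idx < NBUCKETS - 1))
		idx++;

	return (idx);
}

int
main(void)
{
	uint64_t samples[] = { 1, 7, 64, 1000, 123456 };
	int i;

	/* e.g. a 7 us tx-assign delay lands in the "8 us" bucket */
	for (i = 0; i < 5; i++) {
		uint64_t idx = bucket_index(samples[i]);

		printf("%llu us -> bucket %llu (\"%u us\")\n",
		    (unsigned long long)samples[i],
		    (unsigned long long)idx, 1U << idx);
	}
	return (0);
}

On Linux builds the populated histogram should then be readable through the SPL
kstat interface, e.g. under /proc/spl/kstat/zfs/ as dmu_tx_assign-<pool>.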