diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index fa64b3a..b0dc64f 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -20,6 +20,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -40,11 +42,27 @@ typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
     uint64_t arg1, uint64_t arg2);
 
+dmu_tx_stats_t dmu_tx_stats = {
+	{ "dmu_tx_assigned",		KSTAT_DATA_UINT64 },
+	{ "dmu_tx_delay",		KSTAT_DATA_UINT64 },
+	{ "dmu_tx_error",		KSTAT_DATA_UINT64 },
+	{ "dmu_tx_suspended",		KSTAT_DATA_UINT64 },
+	{ "dmu_tx_group",		KSTAT_DATA_UINT64 },
+	{ "dmu_tx_how",			KSTAT_DATA_UINT64 },
+	{ "dmu_tx_memory_reserve",	KSTAT_DATA_UINT64 },
+	{ "dmu_tx_memory_reclaim",	KSTAT_DATA_UINT64 },
+	{ "dmu_tx_memory_inflight",	KSTAT_DATA_UINT64 },
+	{ "dmu_tx_dirty_throttle",	KSTAT_DATA_UINT64 },
+	{ "dmu_tx_write_limit",		KSTAT_DATA_UINT64 },
+	{ "dmu_tx_quota",		KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dmu_tx_ksp;
 
 dmu_tx_t *
 dmu_tx_create_dd(dsl_dir_t *dd)
 {
-	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
+	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_PUSHPAGE);
 	tx->tx_dir = dd;
 	if (dd)
 		tx->tx_pool = dd->dd_pool;
@@ -52,7 +70,7 @@ dmu_tx_create_dd(dsl_dir_t *dd)
 	    offsetof(dmu_tx_hold_t, txh_node));
 	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
 	    offsetof(dmu_tx_callback_t, dcb_node));
-#ifdef ZFS_DEBUG
+#ifdef DEBUG_DMU_TX
 	refcount_create(&tx->tx_space_written);
 	refcount_create(&tx->tx_space_freed);
 #endif
@@ -122,10 +140,10 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 		}
 	}
 
-	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
+	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_PUSHPAGE);
 	txh->txh_tx = tx;
 	txh->txh_dnode = dn;
-#ifdef ZFS_DEBUG
+#ifdef DEBUG_DMU_TX
 	txh->txh_type = type;
 	txh->txh_arg1 = arg1;
 	txh->txh_arg2 = arg2;
@@ -283,6 +301,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 			delta = P2NPHASE(off, dn->dn_datablksz);
 		}
 
+		min_ibs = max_ibs = dn->dn_indblkshift;
 		if (dn->dn_maxblkid > 0) {
 			/*
 			 * The blocksize can't change,
@@ -290,13 +309,6 @@
 			 */
 			ASSERT(dn->dn_datablkshift != 0);
 			min_bs = max_bs = dn->dn_datablkshift;
-			min_ibs = max_ibs = dn->dn_indblkshift;
-		} else if (dn->dn_indblkshift > max_ibs) {
-			/*
-			 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
-			 * the code will still work correctly on older pools.
-			 */
-			min_ibs = max_ibs = dn->dn_indblkshift;
 		}
 
 		/*
@@ -428,6 +440,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 	int epbs;
+	uint64_t l0span = 0, nl1blks = 0;
 
 	if (dn->dn_nlevels == 0)
 		return;
@@ -460,6 +473,7 @@
 			nblks = dn->dn_maxblkid - blkid;
 	}
 
+	l0span = nblks;    /* save for later use to calc level > 1 overhead */
 	if (dn->dn_nlevels == 1) {
 		int i;
 		for (i = 0; i < nblks; i++) {
@@ -472,24 +486,10 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 			}
 			unref += BP_GET_ASIZE(bp);
 		}
+		nl1blks = 1;
 		nblks = 0;
 	}
 
-	/*
-	 * Add in memory requirements of higher-level indirects.
-	 * This assumes a worst-possible scenario for dn_nlevels.
-	 */
-	{
-		uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
-		int level = (dn->dn_nlevels > 1) ? 2 : 1;
-
-		while (level++ < DN_MAX_LEVELS) {
-			txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
-			blkcnt = 1 + (blkcnt >> epbs);
-		}
-		ASSERT(blkcnt <= dn->dn_nblkptr);
-	}
-
 	lastblk = blkid + nblks - 1;
 	while (nblks) {
 		dmu_buf_impl_t *dbuf;
@@ -560,11 +560,35 @@
 		}
 		dbuf_rele(dbuf, FTAG);
 
+		++nl1blks;
 		blkid += tochk;
 		nblks -= tochk;
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 
+	/*
+	 * Add in memory requirements of higher-level indirects.
+	 * This assumes a worst-possible scenario for dn_nlevels and a
+	 * worst-possible distribution of l1-blocks over the region to free.
+	 */
+	{
+		uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
+		int level = 2;
+		/*
+		 * Here we don't use DN_MAX_LEVEL, but calculate it with the
+		 * given datablkshift and indblkshift. This makes the
+		 * difference between 19 and 8 on large files.
+		 */
+		int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
+		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
+
+		while (level++ < maxlevel) {
+			txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
+			    << dn->dn_indblkshift;
+			blkcnt = 1 + (blkcnt >> epbs);
+		}
+	}
+
 	/* account for new level 1 indirect blocks that might show up */
 	if (skipped > 0) {
 		txh->txh_fudge += skipped << dn->dn_indblkshift;
@@ -674,9 +698,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 		return;
 	}
 
-	ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
+	ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 
 	if (dn->dn_maxblkid == 0 && !add) {
+		blkptr_t *bp;
+
 		/*
 		 * If there is only one block (i.e. this is a micro-zap)
 		 * and we are not adding anything, the accounting is simple.
@@ -691,14 +717,13 @@
 		 * Use max block size here, since we don't know how much
 		 * the size will change between now and the dbuf dirty call.
		 */
+		bp = &dn->dn_phys->dn_blkptr[0];
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
-		    &dn->dn_phys->dn_blkptr[0],
-		    dn->dn_phys->dn_blkptr[0].blk_birth)) {
+		    bp, bp->blk_birth))
 			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
-		} else {
+		else
 			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
-		}
-		if (dn->dn_phys->dn_blkptr[0].blk_birth)
+		if (!BP_IS_HOLE(bp))
 			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
 		return;
 	}
@@ -782,7 +807,7 @@ dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 	return (holds);
 }
 
-#ifdef ZFS_DEBUG
+#ifdef DEBUG_DMU_TX
 void
 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 {
@@ -792,6 +817,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
+	ASSERT(dn != NULL);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 	ASSERT3U(dn->dn_object, ==, db->db.db_object);
@@ -809,7 +835,7 @@
 	for (txh = list_head(&tx->tx_holds); txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
-		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
+		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 			match_object = TRUE;
 		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
@@ -897,12 +923,16 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 	uint64_t memory, asize, fsize, usize;
 	uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
 
-	ASSERT3U(tx->tx_txg, ==, 0);
+	ASSERT0(tx->tx_txg);
 
-	if (tx->tx_err)
+	if (tx->tx_err) {
+		DMU_TX_STAT_BUMP(dmu_tx_error);
 		return (tx->tx_err);
+	}
 
 	if (spa_suspended(spa)) {
+		DMU_TX_STAT_BUMP(dmu_tx_suspended);
+
 		/*
 		 * If the user has indicated a blocking failure mode
 		 * then return ERESTART which will block in dmu_tx_wait().
@@ -937,6 +967,7 @@
 			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
 				mutex_exit(&dn->dn_mtx);
 				tx->tx_needassign_txh = txh;
+				DMU_TX_STAT_BUMP(dmu_tx_group);
 				return (ERESTART);
 			}
 			if (dn->dn_assigned_txg == 0)
@@ -957,8 +988,10 @@
 	 * NB: This check must be after we've held the dnodes, so that
 	 * the dmu_tx_unassign() logic will work properly
 	 */
-	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
+	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) {
+		DMU_TX_STAT_BUMP(dmu_tx_how);
 		return (ERESTART);
+	}
 
 	/*
 	 * If a snapshot has been taken since we made our estimates,
@@ -980,7 +1013,7 @@
 	/* calculate memory footprint estimate */
 	memory = towrite + tooverwrite + tohold;
 
-#ifdef ZFS_DEBUG
+#ifdef DEBUG_DMU_TX
 	/*
 	 * Add in 'tohold' to account for our dirty holds on this memory
 	 * XXX - the "fudge" factor is to account for skipped blocks that
@@ -1000,6 +1033,8 @@
 		return (err);
 	}
 
+	DMU_TX_STAT_BUMP(dmu_tx_assigned);
+
 	return (0);
 }
@@ -1054,12 +1089,15 @@ dmu_tx_unassign(dmu_tx_t *tx)
 int
 dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
 {
+	hrtime_t before, after;
 	int err;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(txg_how != 0);
 	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
 
+	before = gethrtime();
+
 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
 		dmu_tx_unassign(tx);
@@ -1071,6 +1109,11 @@
 
 	txg_rele_to_quiesce(&tx->tx_txgh);
 
+	after = gethrtime();
+
+	dsl_pool_tx_assign_add_usecs(tx->tx_pool,
+	    (after - before) / NSEC_PER_USEC);
+
 	return (0);
 }
@@ -1104,7 +1147,7 @@ dmu_tx_wait(dmu_tx_t *tx)
 
 void
 dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
 {
-#ifdef ZFS_DEBUG
+#ifdef DEBUG_DMU_TX
 	if (tx->tx_dir == NULL || delta == 0)
 		return;
 
@@ -1154,7 +1197,7 @@ dmu_tx_commit(dmu_tx_t *tx)
 	list_destroy(&tx->tx_callbacks);
 	list_destroy(&tx->tx_holds);
-#ifdef ZFS_DEBUG
+#ifdef DEBUG_DMU_TX
 	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
 	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
 	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
@@ -1190,7 +1233,7 @@ dmu_tx_abort(dmu_tx_t *tx)
 	list_destroy(&tx->tx_callbacks);
 	list_destroy(&tx->tx_holds);
-#ifdef ZFS_DEBUG
+#ifdef DEBUG_DMU_TX
 	refcount_destroy_many(&tx->tx_space_written,
 	    refcount_count(&tx->tx_space_written));
 	refcount_destroy_many(&tx->tx_space_freed,
@@ -1211,7 +1254,7 @@ dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
 {
 	dmu_tx_callback_t *dcb;
 
-	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
+	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_PUSHPAGE);
 
 	dcb->dcb_func = func;
 	dcb->dcb_data = data;
@@ -1274,7 +1317,6 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
 {
 	dnode_t *dn;
 	dmu_tx_hold_t *txh;
-	blkptr_t *bp;
 
 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
 	    THT_SPILL, 0, 0);
@@ -1285,17 +1327,18 @@
 		return;
 
 	/* If blkptr doesn't exist then add space to towrite */
-	bp = &dn->dn_phys->dn_spill;
-	if (BP_IS_HOLE(bp)) {
+	if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
 		txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
-		txh->txh_space_tounref = 0;
 	} else {
+		blkptr_t *bp;
+
+		bp = &dn->dn_phys->dn_spill;
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 		    bp, bp->blk_birth))
 			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
 		else
 			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
-		if (bp->blk_birth)
+		if (!BP_IS_HOLE(bp))
 			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
 	}
 }
@@ -1382,6 +1425,28 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
 	}
 }
 
+void
+dmu_tx_init(void)
+{
+	dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (dmu_tx_ksp != NULL) {
+		dmu_tx_ksp->ks_data = &dmu_tx_stats;
+		kstat_install(dmu_tx_ksp);
+	}
+}
+
+void
+dmu_tx_fini(void)
+{
+	if (dmu_tx_ksp != NULL) {
+		kstat_delete(dmu_tx_ksp);
+		dmu_tx_ksp = NULL;
+	}
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(dmu_tx_create);
 EXPORT_SYMBOL(dmu_tx_hold_write);
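
Note on observing the counters: the dmu_tx_stats entries registered by
dmu_tx_init() are ordinary named kstats, so they can be read at runtime with
no extra tooling. The following is a minimal userspace sketch assuming the
SPL kstat layer publishes them at /proc/spl/kstat/zfs/dmu_tx, the
conventional location on Linux builds; that path (and the file name used
here) are assumptions for illustration, not something this patch defines.

	/* dmu_tx_kstat_dump.c: print the dmu_tx kstat counters. */
	#include <stdio.h>

	int
	main(void)
	{
		char line[256];
		/* Path assumes the Linux SPL /proc kstat interface. */
		FILE *fp = fopen("/proc/spl/kstat/zfs/dmu_tx", "r");

		if (fp == NULL) {
			perror("fopen");
			return (1);
		}
		/* One "name type data" row per dmu_tx_stats entry. */
		while (fgets(line, sizeof (line), fp) != NULL)
			fputs(line, stdout);
		(void) fclose(fp);
		return (0);
	}

Read against the patch above: dmu_tx_assigned counts dmu_tx_try_assign()
calls that succeeded, dmu_tx_error counts transactions rejected with a
pre-set tx_err, dmu_tx_suspended counts assignments attempted while the pool
was suspended, and dmu_tx_group/dmu_tx_how count the corresponding ERESTART
paths bumped in dmu_tx_try_assign().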