X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fdmu_tx.c;h=fd714135a853329edb9297485a60afb8539cf312;hb=refs%2Fheads%2Frertzinger%2Ffeature-zpool-get--p;hp=af2b049a7155d1f54569f04f96c061499d12689f;hpb=9babb37438b58e77bad04e820d5702e15b79e6a6;p=zfs.git diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index af2b049..fd71413 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -19,8 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #include @@ -33,22 +34,43 @@ #include #include /* for fzap_default_block_shift */ #include +#include +#include #include +#include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); +dmu_tx_stats_t dmu_tx_stats = { + { "dmu_tx_assigned", KSTAT_DATA_UINT64 }, + { "dmu_tx_delay", KSTAT_DATA_UINT64 }, + { "dmu_tx_error", KSTAT_DATA_UINT64 }, + { "dmu_tx_suspended", KSTAT_DATA_UINT64 }, + { "dmu_tx_group", KSTAT_DATA_UINT64 }, + { "dmu_tx_how", KSTAT_DATA_UINT64 }, + { "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 }, + { "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 }, + { "dmu_tx_memory_inflight", KSTAT_DATA_UINT64 }, + { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, + { "dmu_tx_write_limit", KSTAT_DATA_UINT64 }, + { "dmu_tx_quota", KSTAT_DATA_UINT64 }, +}; + +static kstat_t *dmu_tx_ksp; dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { - dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); + dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_PUSHPAGE); tx->tx_dir = dd; if (dd) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); -#ifdef ZFS_DEBUG + list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); +#ifdef DEBUG_DMU_TX refcount_create(&tx->tx_space_written); refcount_create(&tx->tx_space_freed); #endif @@ -58,9 +80,9 @@ dmu_tx_create_dd(dsl_dir_t *dd) dmu_tx_t * dmu_tx_create(objset_t *os) { - dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir); + dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; - tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset); + tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); return (tx); } @@ -98,7 +120,7 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, int err; if (object != DMU_NEW_OBJECT) { - err = dnode_hold(os->os, object, tx, &dn); + err = dnode_hold(os, object, tx, &dn); if (err) { tx->tx_err = err; return (NULL); @@ -118,10 +140,10 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, } } - txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); + txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_PUSHPAGE); txh->txh_tx = tx; txh->txh_dnode = dn; -#ifdef ZFS_DEBUG +#ifdef DEBUG_DMU_TX txh->txh_type = type; txh->txh_arg1 = arg1; txh->txh_arg2 = arg2; @@ -161,38 +183,47 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) } static void -dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db, - boolean_t freeable, dmu_buf_impl_t **history) +dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, + int level, uint64_t blkid, boolean_t freeable, uint64_t *history) { - int i = db->db_level + 1; - dnode_t *dn = db->db_dnode; - - if (i >= dn->dn_nlevels) + objset_t *os = dn->dn_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + dmu_buf_impl_t *parent = NULL; + blkptr_t *bp = NULL; + uint64_t space; + + if (level >= dn->dn_nlevels || history[level] == blkid) return; - db = db->db_parent; - if (db == NULL) { - uint64_t lvls = dn->dn_nlevels - i; + history[level] = blkid; - txh->txh_space_towrite += lvls << dn->dn_indblkshift; - return; + space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift); + + if (db == NULL || db == dn->dn_dbuf) { + ASSERT(level != 0); + db = NULL; + } else { + ASSERT(DB_DNODE(db) == dn); + ASSERT(db->db_level == level); + ASSERT(db->db.db_size == space); + ASSERT(db->db_blkid == blkid); + bp = db->db_blkptr; + parent = db->db_parent; } - if (db != history[i]) { - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - uint64_t space = 1ULL << dn->dn_indblkshift; + freeable = (bp && (freeable || + dsl_dataset_block_freeable(ds, bp, bp->blk_birth))); - freeable = (db->db_blkptr && (freeable || - dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth))); - if (freeable) - txh->txh_space_tooverwrite += space; - else - txh->txh_space_towrite += space; - if (db->db_blkptr) - txh->txh_space_tounref += space; - history[i] = db; - dmu_tx_count_indirects(txh, db, freeable, history); - } + if (freeable) + txh->txh_space_tooverwrite += space; + else + txh->txh_space_towrite += space; + if (bp) + txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp); + + dmu_tx_count_twig(txh, dn, parent, level + 1, + blkid >> epbs, freeable, history); } /* ARGSUSED */ @@ -203,6 +234,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) uint64_t start, end, i; int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; int err = 0; + int l; if (len == 0) return; @@ -213,7 +245,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) max_ibs = DN_MAX_INDBLKSHIFT; if (dn) { - dmu_buf_impl_t *last[DN_MAX_LEVELS]; + uint64_t history[DN_MAX_LEVELS]; int nlvls = dn->dn_nlevels; int delta; @@ -269,6 +301,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) delta = P2NPHASE(off, dn->dn_datablksz); } + min_ibs = max_ibs = dn->dn_indblkshift; if (dn->dn_maxblkid > 0) { /* * The blocksize can't change, @@ -276,42 +309,30 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) */ ASSERT(dn->dn_datablkshift != 0); min_bs = max_bs = dn->dn_datablkshift; - min_ibs = max_ibs = dn->dn_indblkshift; - } else if (dn->dn_indblkshift > max_ibs) { - /* - * This ensures that if we reduce DN_MAX_INDBLKSHIFT, - * the code will still work correctly on older pools. - */ - min_ibs = max_ibs = dn->dn_indblkshift; } /* * If this write is not off the end of the file * we need to account for overwrites/unref. */ - if (start <= dn->dn_maxblkid) - bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS); + if (start <= dn->dn_maxblkid) { + for (l = 0; l < DN_MAX_LEVELS; l++) + history[l] = -1ULL; + } while (start <= dn->dn_maxblkid) { - spa_t *spa = txh->txh_tx->tx_pool->dp_spa; - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold_level(dn, 0, start, FTAG); + err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); - if (db->db_blkptr && dsl_dataset_block_freeable(ds, - db->db_blkptr->blk_birth)) { - dprintf_bp(db->db_blkptr, "can free old%s", ""); - txh->txh_space_tooverwrite += dn->dn_datablksz; - txh->txh_space_tounref += dn->dn_datablksz; - dmu_tx_count_indirects(txh, db, TRUE, last); - } else { - txh->txh_space_towrite += dn->dn_datablksz; - if (db->db_blkptr) - txh->txh_space_tounref += - bp_get_dasize(spa, db->db_blkptr); - dmu_tx_count_indirects(txh, db, FALSE, last); + + if (err) { + txh->txh_tx->tx_err = err; + return; } + + dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, + history); dbuf_rele(db, FTAG); if (++start > end) { /* @@ -376,13 +397,13 @@ static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { dnode_t *dn = txh->txh_dnode; - dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode; + dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset); uint64_t space = mdn->dn_datablksz + ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); if (dn && dn->dn_dbuf->db_blkptr && dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_dbuf->db_blkptr->blk_birth)) { + dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) { txh->txh_space_tooverwrite += space; txh->txh_space_tounref += space; } else { @@ -419,6 +440,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; spa_t *spa = txh->txh_tx->tx_pool->dp_spa; int epbs; + uint64_t l0span = 0, nl1blks = 0; if (dn->dn_nlevels == 0) return; @@ -427,7 +449,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) * The struct_rwlock protects us against dn_nlevels * changing, in case (against all odds) we manage to dirty & * sync out the changes after we check for being dirty. - * Also, dbuf_hold_level() wants us to have the struct_rwlock. + * Also, dbuf_hold_impl() wants us to have the struct_rwlock. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; @@ -451,36 +473,23 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) nblks = dn->dn_maxblkid - blkid; } + l0span = nblks; /* save for later use to calc level > 1 overhead */ if (dn->dn_nlevels == 1) { int i; for (i = 0; i < nblks; i++) { blkptr_t *bp = dn->dn_phys->dn_blkptr; ASSERT3U(blkid + i, <, dn->dn_nblkptr); bp += blkid + i; - if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { + if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) { dprintf_bp(bp, "can free old%s", ""); - space += bp_get_dasize(spa, bp); + space += bp_get_dsize(spa, bp); } unref += BP_GET_ASIZE(bp); } + nl1blks = 1; nblks = 0; } - /* - * Add in memory requirements of higher-level indirects. - * This assumes a worst-possible scenario for dn_nlevels. - */ - { - uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs); - int level = (dn->dn_nlevels > 1) ? 2 : 1; - - while (level++ < DN_MAX_LEVELS) { - txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift; - blkcnt = 1 + (blkcnt >> epbs); - } - ASSERT(blkcnt <= dn->dn_nblkptr); - } - lastblk = blkid + nblks - 1; while (nblks) { dmu_buf_impl_t *dbuf; @@ -515,14 +524,22 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) blkoff = P2PHASE(blkid, epb); tochk = MIN(epb - blkoff, nblks); - dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); - - txh->txh_memory_tohold += dbuf->db.db_size; - if (txh->txh_memory_tohold > DMU_MAX_ACCESS) { - txh->txh_tx->tx_err = E2BIG; - dbuf_rele(dbuf, FTAG); + err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); + if (err) { + txh->txh_tx->tx_err = err; break; } + + txh->txh_memory_tohold += dbuf->db.db_size; + + /* + * We don't check memory_tohold against DMU_MAX_ACCESS because + * memory_tohold is an over-estimation (especially the >L1 + * indirect blocks), so it could fail. Callers should have + * already verified that they will not be holding too much + * memory. + */ + err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); if (err != 0) { txh->txh_tx->tx_err = err; @@ -534,19 +551,44 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) bp += blkoff; for (i = 0; i < tochk; i++) { - if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) { + if (dsl_dataset_block_freeable(ds, &bp[i], + bp[i].blk_birth)) { dprintf_bp(&bp[i], "can free old%s", ""); - space += bp_get_dasize(spa, &bp[i]); + space += bp_get_dsize(spa, &bp[i]); } unref += BP_GET_ASIZE(bp); } dbuf_rele(dbuf, FTAG); + ++nl1blks; blkid += tochk; nblks -= tochk; } rw_exit(&dn->dn_struct_rwlock); + /* + * Add in memory requirements of higher-level indirects. + * This assumes a worst-possible scenario for dn_nlevels and a + * worst-possible distribution of l1-blocks over the region to free. + */ + { + uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs); + int level = 2; + /* + * Here we don't use DN_MAX_LEVEL, but calculate it with the + * given datablkshift and indblkshift. This makes the + * difference between 19 and 8 on large files. + */ + int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) / + (dn->dn_indblkshift - SPA_BLKPTRSHIFT); + + while (level++ < maxlevel) { + txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1) + << dn->dn_indblkshift; + blkcnt = 1 + (blkcnt >> epbs); + } + } + /* account for new level 1 indirect blocks that might show up */ if (skipped > 0) { txh->txh_fudge += skipped << dn->dn_indblkshift; @@ -581,6 +623,8 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) if (len != DMU_OBJECT_END) dmu_tx_count_write(txh, off+len, 1); + dmu_tx_count_dnode(txh); + if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) @@ -623,7 +667,6 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) } } - dmu_tx_count_dnode(txh); dmu_tx_count_free(txh, off, len); } @@ -655,9 +698,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) return; } - ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 && !add) { + blkptr_t *bp; + /* * If there is only one block (i.e. this is a micro-zap) * and we are not adding anything, the accounting is simple. @@ -672,13 +717,13 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) * Use max block size here, since we don't know how much * the size will change between now and the dbuf dirty call. */ + bp = &dn->dn_phys->dn_blkptr[0]; if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_phys->dn_blkptr[0].blk_birth)) { + bp, bp->blk_birth)) txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; - } else { + else txh->txh_space_towrite += SPA_MAXBLOCKSIZE; - } - if (dn->dn_phys->dn_blkptr[0].blk_birth) + if (!BP_IS_HOLE(bp)) txh->txh_space_tounref += SPA_MAXBLOCKSIZE; return; } @@ -688,7 +733,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) * access the name in this fat-zap so that we'll check * for i/o errors to the leaf blocks, etc. */ - err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name, + err = zap_lookup(dn->dn_objset, dn->dn_object, name, 8, 0, NULL); if (err == EIO) { tx->tx_err = err; @@ -696,9 +741,8 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) } } - err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add, - &txh->txh_space_towrite, &txh->txh_space_tooverwrite, - txh->txh_dnode->dn_datablkshift); + err = zap_count_write(dn->dn_objset, dn->dn_object, name, add, + &txh->txh_space_towrite, &txh->txh_space_tooverwrite); /* * If the modified blocks are scattered to the four winds, @@ -729,12 +773,13 @@ void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) { dmu_tx_hold_t *txh; + ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE, space, 0); - - txh->txh_space_towrite += space; + if (txh) + txh->txh_space_towrite += space; } int @@ -763,28 +808,35 @@ dmu_tx_holds(dmu_tx_t *tx, uint64_t object) return (holds); } -#ifdef ZFS_DEBUG +#ifdef DEBUG_DMU_TX void dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { dmu_tx_hold_t *txh; int match_object = FALSE, match_offset = FALSE; - dnode_t *dn = db->db_dnode; + dnode_t *dn; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(dn != NULL); ASSERT(tx->tx_txg != 0); - ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os); + ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); - if (tx->tx_anyobj) + if (tx->tx_anyobj) { + DB_DNODE_EXIT(db); return; + } /* XXX No checking on the meta dnode for now */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) + if (db->db.db_object == DMU_META_DNODE_OBJECT) { + DB_DNODE_EXIT(db); return; + } for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { - ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) match_object = TRUE; if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { @@ -809,10 +861,11 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) match_offset = TRUE; /* * We will let this hold work for the bonus - * buffer so that we don't need to hold it - * when creating a new object. + * or spill buffer so that we don't need to + * hold it when creating a new object. */ - if (blkid == DB_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID || + blkid == DMU_SPILL_BLKID) match_offset = TRUE; /* * They might have to increase nlevels, @@ -833,8 +886,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; + case THT_SPILL: + if (blkid == DMU_SPILL_BLKID) + match_offset = TRUE; + break; case THT_BONUS: - if (blkid == DB_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID) match_offset = TRUE; break; case THT_ZAP: @@ -847,9 +904,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) ASSERT(!"bad txh_type"); } } - if (match_object && match_offset) + if (match_object && match_offset) { + DB_DNODE_EXIT(db); return; + } } + DB_DNODE_EXIT(db); panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", (u_longlong_t)db->db.db_object, db->db_level, (u_longlong_t)db->db_blkid); @@ -864,12 +924,16 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) uint64_t memory, asize, fsize, usize; uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; - ASSERT3U(tx->tx_txg, ==, 0); + ASSERT0(tx->tx_txg); - if (tx->tx_err) + if (tx->tx_err) { + DMU_TX_STAT_BUMP(dmu_tx_error); return (tx->tx_err); + } if (spa_suspended(spa)) { + DMU_TX_STAT_BUMP(dmu_tx_suspended); + /* * If the user has indicated a blocking failure mode * then return ERESTART which will block in dmu_tx_wait(). @@ -904,6 +968,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) if (dn->dn_assigned_txg == tx->tx_txg - 1) { mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = txh; + DMU_TX_STAT_BUMP(dmu_tx_group); return (ERESTART); } if (dn->dn_assigned_txg == 0) @@ -924,15 +989,17 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) * NB: This check must be after we've held the dnodes, so that * the dmu_tx_unassign() logic will work properly */ - if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) + if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) { + DMU_TX_STAT_BUMP(dmu_tx_how); return (ERESTART); + } /* * If a snapshot has been taken since we made our estimates, * assume that we won't be able to free or overwrite anything. */ if (tx->tx_objset && - dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) > + dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) > tx->tx_lastsnap_txg) { towrite += tooverwrite; tooverwrite = tofree = 0; @@ -947,7 +1014,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) /* calculate memory footprint estimate */ memory = towrite + tooverwrite + tohold; -#ifdef ZFS_DEBUG +#ifdef DEBUG_DMU_TX /* * Add in 'tohold' to account for our dirty holds on this memory * XXX - the "fudge" factor is to account for skipped blocks that @@ -967,6 +1034,8 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) return (err); } + DMU_TX_STAT_BUMP(dmu_tx_assigned); + return (0); } @@ -1021,12 +1090,15 @@ dmu_tx_unassign(dmu_tx_t *tx) int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) { + hrtime_t before, after; int err; ASSERT(tx->tx_txg == 0); ASSERT(txg_how != 0); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); + before = gethrtime(); + while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { dmu_tx_unassign(tx); @@ -1038,6 +1110,11 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) txg_rele_to_quiesce(&tx->tx_txgh); + after = gethrtime(); + + dsl_pool_tx_assign_add_usecs(tx->tx_pool, + (after - before) / NSEC_PER_USEC); + return (0); } @@ -1071,7 +1148,7 @@ dmu_tx_wait(dmu_tx_t *tx) void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) { -#ifdef ZFS_DEBUG +#ifdef DEBUG_DMU_TX if (tx->tx_dir == NULL || delta == 0) return; @@ -1092,7 +1169,7 @@ dmu_tx_commit(dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); - while (txh = list_head(&tx->tx_holds)) { + while ((txh = list_head(&tx->tx_holds))) { dnode_t *dn = txh->txh_dnode; list_remove(&tx->tx_holds, txh); @@ -1113,10 +1190,15 @@ dmu_tx_commit(dmu_tx_t *tx) if (tx->tx_tempreserve_cookie) dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); + if (!list_is_empty(&tx->tx_callbacks)) + txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); + if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); + + list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); -#ifdef ZFS_DEBUG +#ifdef DEBUG_DMU_TX dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", tx->tx_space_towrite, refcount_count(&tx->tx_space_written), tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); @@ -1135,7 +1217,7 @@ dmu_tx_abort(dmu_tx_t *tx) ASSERT(tx->tx_txg == 0); - while (txh = list_head(&tx->tx_holds)) { + while ((txh = list_head(&tx->tx_holds))) { dnode_t *dn = txh->txh_dnode; list_remove(&tx->tx_holds, txh); @@ -1143,8 +1225,16 @@ dmu_tx_abort(dmu_tx_t *tx) if (dn != NULL) dnode_rele(dn, tx); } + + /* + * Call any registered callbacks with an error code. + */ + if (!list_is_empty(&tx->tx_callbacks)) + dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); + + list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); -#ifdef ZFS_DEBUG +#ifdef DEBUG_DMU_TX refcount_destroy_many(&tx->tx_space_written, refcount_count(&tx->tx_space_written)); refcount_destroy_many(&tx->tx_space_freed, @@ -1159,3 +1249,221 @@ dmu_tx_get_txg(dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); return (tx->tx_txg); } + +void +dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) +{ + dmu_tx_callback_t *dcb; + + dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_PUSHPAGE); + + dcb->dcb_func = func; + dcb->dcb_data = data; + + list_insert_tail(&tx->tx_callbacks, dcb); +} + +/* + * Call all the commit callbacks on a list, with a given error code. + */ +void +dmu_tx_do_callbacks(list_t *cb_list, int error) +{ + dmu_tx_callback_t *dcb; + + while ((dcb = list_head(cb_list))) { + list_remove(cb_list, dcb); + dcb->dcb_func(dcb->dcb_data, error); + kmem_free(dcb, sizeof (dmu_tx_callback_t)); + } +} + +/* + * Interface to hold a bunch of attributes. + * used for creating new files. + * attrsize is the total size of all attributes + * to be added during object creation + * + * For updating/adding a single attribute dmu_tx_hold_sa() should be used. + */ + +/* + * hold necessary attribute name for attribute registration. + * should be a very rare case where this is needed. If it does + * happen it would only happen on the first write to the file system. + */ +static void +dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) +{ + int i; + + if (!sa->sa_need_attr_registration) + return; + + for (i = 0; i != sa->sa_num_attrs; i++) { + if (!sa->sa_attr_table[i].sa_registered) { + if (sa->sa_reg_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, + B_TRUE, sa->sa_attr_table[i].sa_name); + else + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, + B_TRUE, sa->sa_attr_table[i].sa_name); + } + } +} + + +void +dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) +{ + dnode_t *dn; + dmu_tx_hold_t *txh; + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, + THT_SPILL, 0, 0); + if (txh == NULL) + return; + + dn = txh->txh_dnode; + + if (dn == NULL) + return; + + /* If blkptr doesn't exist then add space to towrite */ + if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { + txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + } else { + blkptr_t *bp; + + bp = &dn->dn_phys->dn_spill; + if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + bp, bp->blk_birth)) + txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; + else + txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + if (!BP_IS_HOLE(bp)) + txh->txh_space_tounref += SPA_MAXBLOCKSIZE; + } +} + +void +dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) +{ + sa_os_t *sa = tx->tx_objset->os_sa; + + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_layout_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + else { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) + return; + + (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, + THT_SPILL, 0, 0); +} + +/* + * Hold SA attribute + * + * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) + * + * variable_size is the total size of all variable sized attributes + * passed to this function. It is not the total size of all + * variable size attributes that *may* exist on this object. + */ +void +dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) +{ + uint64_t object; + sa_os_t *sa = tx->tx_objset->os_sa; + + ASSERT(hdl != NULL); + + object = sa_handle_object(hdl); + + dmu_tx_hold_bonus(tx, object); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || + tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + + if (sa->sa_force_spill || may_grow || hdl->sa_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } else { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn->dn_have_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } + DB_DNODE_EXIT(db); + } +} + +void +dmu_tx_init(void) +{ + dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc", + KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (dmu_tx_ksp != NULL) { + dmu_tx_ksp->ks_data = &dmu_tx_stats; + kstat_install(dmu_tx_ksp); + } +} + +void +dmu_tx_fini(void) +{ + if (dmu_tx_ksp != NULL) { + kstat_delete(dmu_tx_ksp); + dmu_tx_ksp = NULL; + } +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +EXPORT_SYMBOL(dmu_tx_create); +EXPORT_SYMBOL(dmu_tx_hold_write); +EXPORT_SYMBOL(dmu_tx_hold_free); +EXPORT_SYMBOL(dmu_tx_hold_zap); +EXPORT_SYMBOL(dmu_tx_hold_bonus); +EXPORT_SYMBOL(dmu_tx_abort); +EXPORT_SYMBOL(dmu_tx_assign); +EXPORT_SYMBOL(dmu_tx_wait); +EXPORT_SYMBOL(dmu_tx_commit); +EXPORT_SYMBOL(dmu_tx_get_txg); +EXPORT_SYMBOL(dmu_tx_callback_register); +EXPORT_SYMBOL(dmu_tx_do_callbacks); +EXPORT_SYMBOL(dmu_tx_hold_spill); +EXPORT_SYMBOL(dmu_tx_hold_sa_create); +EXPORT_SYMBOL(dmu_tx_hold_sa); +#endif