X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=module%2Fzfs%2Fdmu_tx.c;h=fd714135a853329edb9297485a60afb8539cf312;hb=refs%2Fheads%2Frertzinger%2Ffeature-zpool-get--p;hp=bf560e5657c1c2db023e072b4f3e56418e2bd558;hpb=172bb4bd5e4afef721dd4d2972d8680d983f144b;p=zfs.git diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index bf560e5..fd71413 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -19,8 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #include @@ -33,22 +34,43 @@ #include #include /* for fzap_default_block_shift */ #include +#include +#include #include +#include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); +dmu_tx_stats_t dmu_tx_stats = { + { "dmu_tx_assigned", KSTAT_DATA_UINT64 }, + { "dmu_tx_delay", KSTAT_DATA_UINT64 }, + { "dmu_tx_error", KSTAT_DATA_UINT64 }, + { "dmu_tx_suspended", KSTAT_DATA_UINT64 }, + { "dmu_tx_group", KSTAT_DATA_UINT64 }, + { "dmu_tx_how", KSTAT_DATA_UINT64 }, + { "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 }, + { "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 }, + { "dmu_tx_memory_inflight", KSTAT_DATA_UINT64 }, + { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, + { "dmu_tx_write_limit", KSTAT_DATA_UINT64 }, + { "dmu_tx_quota", KSTAT_DATA_UINT64 }, +}; + +static kstat_t *dmu_tx_ksp; dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { - dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); + dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_PUSHPAGE); tx->tx_dir = dd; if (dd) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); -#ifdef ZFS_DEBUG + list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); +#ifdef DEBUG_DMU_TX refcount_create(&tx->tx_space_written); refcount_create(&tx->tx_space_freed); #endif @@ -58,9 +80,9 @@ dmu_tx_create_dd(dsl_dir_t *dd) dmu_tx_t * dmu_tx_create(objset_t *os) { - dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir); + dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; - tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset); + tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); return (tx); } @@ -98,7 +120,7 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, int err; if (object != DMU_NEW_OBJECT) { - err = dnode_hold(os->os, object, tx, &dn); + err = dnode_hold(os, object, tx, &dn); if (err) { tx->tx_err = err; return (NULL); @@ -118,10 +140,10 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, } } - txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); + txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_PUSHPAGE); txh->txh_tx = tx; txh->txh_dnode = dn; -#ifdef ZFS_DEBUG +#ifdef DEBUG_DMU_TX txh->txh_type = type; txh->txh_arg1 = arg1; txh->txh_arg2 = arg2; @@ -160,6 +182,50 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) return (err); } +static void +dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, + int level, uint64_t blkid, boolean_t freeable, uint64_t *history) +{ + objset_t *os = dn->dn_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + 
dmu_buf_impl_t *parent = NULL; + blkptr_t *bp = NULL; + uint64_t space; + + if (level >= dn->dn_nlevels || history[level] == blkid) + return; + + history[level] = blkid; + + space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift); + + if (db == NULL || db == dn->dn_dbuf) { + ASSERT(level != 0); + db = NULL; + } else { + ASSERT(DB_DNODE(db) == dn); + ASSERT(db->db_level == level); + ASSERT(db->db.db_size == space); + ASSERT(db->db_blkid == blkid); + bp = db->db_blkptr; + parent = db->db_parent; + } + + freeable = (bp && (freeable || + dsl_dataset_block_freeable(ds, bp, bp->blk_birth))); + + if (freeable) + txh->txh_space_tooverwrite += space; + else + txh->txh_space_towrite += space; + if (bp) + txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp); + + dmu_tx_count_twig(txh, dn, parent, level + 1, + blkid >> epbs, freeable, history); +} + /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) @@ -168,6 +234,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) uint64_t start, end, i; int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; int err = 0; + int l; if (len == 0) return; @@ -177,18 +244,24 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) min_ibs = DN_MIN_INDBLKSHIFT; max_ibs = DN_MAX_INDBLKSHIFT; - /* - * For i/o error checking, read the first and last level-0 - * blocks (if they are not aligned), and all the level-1 blocks. - */ - if (dn) { + uint64_t history[DN_MAX_LEVELS]; + int nlvls = dn->dn_nlevels; + int delta; + + /* + * For i/o error checking, read the first and last level-0 + * blocks (if they are not aligned), and all the level-1 blocks. + */ if (dn->dn_maxblkid == 0) { - if ((off > 0 || len < dn->dn_datablksz) && - off < dn->dn_datablksz) { + delta = dn->dn_datablksz; + start = (off < dn->dn_datablksz) ? 0 : 1; + end = (off+len <= dn->dn_datablksz) ? 0 : 1; + if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err) goto out; + delta -= off; } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, @@ -213,10 +286,9 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } /* level-1 blocks */ - if (dn->dn_nlevels > 1) { - start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; - end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (i = start+1; i < end; i++) { + if (nlvls > 1) { + int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (i = (start>>shft)+1; i < end>>shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err) goto out; @@ -226,20 +298,59 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) err = zio_wait(zio); if (err) goto out; + delta = P2NPHASE(off, dn->dn_datablksz); } - } - /* - * If there's more than one block, the blocksize can't change, - * so we can make a more precise estimate. Alternatively, - * if the dnode's ibs is larger than max_ibs, always use that. - * This ensures that if we reduce DN_MAX_INDBLKSHIFT, - * the code will still work correctly on existing pools. - */ - if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) { min_ibs = max_ibs = dn->dn_indblkshift; - if (dn->dn_datablkshift != 0) + if (dn->dn_maxblkid > 0) { + /* + * The blocksize can't change, + * so we can make a more precise estimate. + */ + ASSERT(dn->dn_datablkshift != 0); min_bs = max_bs = dn->dn_datablkshift; + } + + /* + * If this write is not off the end of the file + * we need to account for overwrites/unref. 
+ */ + if (start <= dn->dn_maxblkid) { + for (l = 0; l < DN_MAX_LEVELS; l++) + history[l] = -1ULL; + } + while (start <= dn->dn_maxblkid) { + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); + rw_exit(&dn->dn_struct_rwlock); + + if (err) { + txh->txh_tx->tx_err = err; + return; + } + + dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, + history); + dbuf_rele(db, FTAG); + if (++start > end) { + /* + * Account for new indirects appearing + * before this IO gets assigned into a txg. + */ + bits = 64 - min_bs; + epbs = min_ibs - SPA_BLKPTRSHIFT; + for (bits -= epbs * (nlvls - 1); + bits >= 0; bits -= epbs) + txh->txh_fudge += 1ULL << max_ibs; + goto out; + } + off += delta; + if (len >= delta) + len -= delta; + delta = dn->dn_datablksz; + } } /* @@ -262,20 +373,22 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { start >>= epbs; end >>= epbs; - /* - * If we increase the number of levels of indirection, - * we'll need new blkid=0 indirect blocks. If start == 0, - * we're already accounting for that blocks; and if end == 0, - * we can't increase the number of levels beyond that. - */ - if (start != 0 && end != 0) - txh->txh_space_towrite += 1ULL << max_ibs; + ASSERT3U(end, >=, start); txh->txh_space_towrite += (end - start + 1) << max_ibs; + if (start != 0) { + /* + * We also need a new blkid=0 indirect block + * to reference any existing file data. + */ + txh->txh_space_towrite += 1ULL << max_ibs; + } } - ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS); - out: + if (txh->txh_space_towrite + txh->txh_space_tooverwrite > + 2 * DMU_MAX_ACCESS) + err = EFBIG; + if (err) txh->txh_tx->tx_err = err; } @@ -284,14 +397,15 @@ static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { dnode_t *dn = txh->txh_dnode; - dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode; + dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset); uint64_t space = mdn->dn_datablksz + ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); if (dn && dn->dn_dbuf->db_blkptr && dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_dbuf->db_blkptr->blk_birth)) { + dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) { txh->txh_space_tooverwrite += space; + txh->txh_space_tounref += space; } else { txh->txh_space_towrite += space; if (dn && dn->dn_dbuf->db_blkptr) @@ -326,6 +440,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; spa_t *spa = txh->txh_tx->tx_pool->dp_spa; int epbs; + uint64_t l0span = 0, nl1blks = 0; if (dn->dn_nlevels == 0) return; @@ -334,7 +449,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) * The struct_rwlock protects us against dn_nlevels * changing, in case (against all odds) we manage to dirty & * sync out the changes after we check for being dirty. - * Also, dbuf_hold_level() wants us to have the struct_rwlock. + * Also, dbuf_hold_impl() wants us to have the struct_rwlock. 
*/ rw_enter(&dn->dn_struct_rwlock, RW_READER); epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; @@ -358,36 +473,23 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) nblks = dn->dn_maxblkid - blkid; } + l0span = nblks; /* save for later use to calc level > 1 overhead */ if (dn->dn_nlevels == 1) { int i; for (i = 0; i < nblks; i++) { blkptr_t *bp = dn->dn_phys->dn_blkptr; ASSERT3U(blkid + i, <, dn->dn_nblkptr); bp += blkid + i; - if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { + if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) { dprintf_bp(bp, "can free old%s", ""); - space += bp_get_dasize(spa, bp); + space += bp_get_dsize(spa, bp); } unref += BP_GET_ASIZE(bp); } + nl1blks = 1; nblks = 0; } - /* - * Add in memory requirements of higher-level indirects. - * This assumes a worst-possible scenario for dn_nlevels. - */ - { - uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs); - int level = (dn->dn_nlevels > 1) ? 2 : 1; - - while (level++ < DN_MAX_LEVELS) { - txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift; - blkcnt = 1 + (blkcnt >> epbs); - } - ASSERT(blkcnt <= dn->dn_nblkptr); - } - lastblk = blkid + nblks - 1; while (nblks) { dmu_buf_impl_t *dbuf; @@ -422,14 +524,22 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) blkoff = P2PHASE(blkid, epb); tochk = MIN(epb - blkoff, nblks); - dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); - - txh->txh_memory_tohold += dbuf->db.db_size; - if (txh->txh_memory_tohold > DMU_MAX_ACCESS) { - txh->txh_tx->tx_err = E2BIG; - dbuf_rele(dbuf, FTAG); + err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); + if (err) { + txh->txh_tx->tx_err = err; break; } + + txh->txh_memory_tohold += dbuf->db.db_size; + + /* + * We don't check memory_tohold against DMU_MAX_ACCESS because + * memory_tohold is an over-estimation (especially the >L1 + * indirect blocks), so it could fail. Callers should have + * already verified that they will not be holding too much + * memory. + */ + err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); if (err != 0) { txh->txh_tx->tx_err = err; @@ -441,19 +551,44 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) bp += blkoff; for (i = 0; i < tochk; i++) { - if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) { + if (dsl_dataset_block_freeable(ds, &bp[i], + bp[i].blk_birth)) { dprintf_bp(&bp[i], "can free old%s", ""); - space += bp_get_dasize(spa, &bp[i]); + space += bp_get_dsize(spa, &bp[i]); } unref += BP_GET_ASIZE(bp); } dbuf_rele(dbuf, FTAG); + ++nl1blks; blkid += tochk; nblks -= tochk; } rw_exit(&dn->dn_struct_rwlock); + /* + * Add in memory requirements of higher-level indirects. + * This assumes a worst-possible scenario for dn_nlevels and a + * worst-possible distribution of l1-blocks over the region to free. + */ + { + uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs); + int level = 2; + /* + * Here we don't use DN_MAX_LEVEL, but calculate it with the + * given datablkshift and indblkshift. This makes the + * difference between 19 and 8 on large files. 
+ */ + int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) / + (dn->dn_indblkshift - SPA_BLKPTRSHIFT); + + while (level++ < maxlevel) { + txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1) + << dn->dn_indblkshift; + blkcnt = 1 + (blkcnt >> epbs); + } + } + /* account for new level 1 indirect blocks that might show up */ if (skipped > 0) { txh->txh_fudge += skipped << dn->dn_indblkshift; @@ -488,6 +623,8 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) if (len != DMU_OBJECT_END) dmu_tx_count_write(txh, off+len, 1); + dmu_tx_count_dnode(txh); + if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) @@ -530,12 +667,11 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) } } - dmu_tx_count_dnode(txh); dmu_tx_count_free(txh, off, len); } void -dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) { dmu_tx_hold_t *txh; dnode_t *dn; @@ -562,9 +698,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) return; } - ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 && !add) { + blkptr_t *bp; + /* * If there is only one block (i.e. this is a micro-zap) * and we are not adding anything, the accounting is simple. @@ -579,14 +717,14 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) * Use max block size here, since we don't know how much * the size will change between now and the dbuf dirty call. */ + bp = &dn->dn_phys->dn_blkptr[0]; if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_phys->dn_blkptr[0].blk_birth)) { + bp, bp->blk_birth)) txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; - } else { + else txh->txh_space_towrite += SPA_MAXBLOCKSIZE; - txh->txh_space_tounref += - BP_GET_ASIZE(dn->dn_phys->dn_blkptr); - } + if (!BP_IS_HOLE(bp)) + txh->txh_space_tounref += SPA_MAXBLOCKSIZE; return; } @@ -595,7 +733,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) * access the name in this fat-zap so that we'll check * for i/o errors to the leaf blocks, etc. */ - err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name, + err = zap_lookup(dn->dn_objset, dn->dn_object, name, 8, 0, NULL); if (err == EIO) { tx->tx_err = err; @@ -603,12 +741,8 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) } } - /* - * 3 blocks overwritten: target leaf, ptrtbl block, header block - * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks - */ - dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, - (3 + (add ? 
3 : 0)) << dn->dn_datablkshift); + err = zap_count_write(dn->dn_objset, dn->dn_object, name, add, + &txh->txh_space_towrite, &txh->txh_space_tooverwrite); /* * If the modified blocks are scattered to the four winds, @@ -616,7 +750,10 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) */ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) - txh->txh_space_towrite += 3 << dn->dn_indblkshift; + if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj) + txh->txh_space_towrite += 3 << dn->dn_indblkshift; + else + txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift; } void @@ -636,12 +773,13 @@ void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) { dmu_tx_hold_t *txh; + ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE, space, 0); - - txh->txh_space_towrite += space; + if (txh) + txh->txh_space_towrite += space; } int @@ -670,28 +808,35 @@ dmu_tx_holds(dmu_tx_t *tx, uint64_t object) return (holds); } -#ifdef ZFS_DEBUG +#ifdef DEBUG_DMU_TX void dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { dmu_tx_hold_t *txh; int match_object = FALSE, match_offset = FALSE; - dnode_t *dn = db->db_dnode; + dnode_t *dn; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(dn != NULL); ASSERT(tx->tx_txg != 0); - ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os); + ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); - if (tx->tx_anyobj) + if (tx->tx_anyobj) { + DB_DNODE_EXIT(db); return; + } /* XXX No checking on the meta dnode for now */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) + if (db->db.db_object == DMU_META_DNODE_OBJECT) { + DB_DNODE_EXIT(db); return; + } for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { - ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) match_object = TRUE; if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { @@ -716,10 +861,11 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) match_offset = TRUE; /* * We will let this hold work for the bonus - * buffer so that we don't need to hold it - * when creating a new object. + * or spill buffer so that we don't need to + * hold it when creating a new object. 
*/ - if (blkid == DB_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID || + blkid == DMU_SPILL_BLKID) match_offset = TRUE; /* * They might have to increase nlevels, @@ -740,8 +886,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; + case THT_SPILL: + if (blkid == DMU_SPILL_BLKID) + match_offset = TRUE; + break; case THT_BONUS: - if (blkid == DB_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID) match_offset = TRUE; break; case THT_ZAP: @@ -754,9 +904,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) ASSERT(!"bad txh_type"); } } - if (match_object && match_offset) + if (match_object && match_offset) { + DB_DNODE_EXIT(db); return; + } } + DB_DNODE_EXIT(db); panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", (u_longlong_t)db->db.db_object, db->db_level, (u_longlong_t)db->db_blkid); @@ -771,12 +924,16 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) uint64_t memory, asize, fsize, usize; uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; - ASSERT3U(tx->tx_txg, ==, 0); + ASSERT0(tx->tx_txg); - if (tx->tx_err) + if (tx->tx_err) { + DMU_TX_STAT_BUMP(dmu_tx_error); return (tx->tx_err); + } if (spa_suspended(spa)) { + DMU_TX_STAT_BUMP(dmu_tx_suspended); + /* * If the user has indicated a blocking failure mode * then return ERESTART which will block in dmu_tx_wait(). @@ -811,6 +968,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) if (dn->dn_assigned_txg == tx->tx_txg - 1) { mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = txh; + DMU_TX_STAT_BUMP(dmu_tx_group); return (ERESTART); } if (dn->dn_assigned_txg == 0) @@ -831,15 +989,17 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) * NB: This check must be after we've held the dnodes, so that * the dmu_tx_unassign() logic will work properly */ - if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) + if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) { + DMU_TX_STAT_BUMP(dmu_tx_how); return (ERESTART); + } /* * If a snapshot has been taken since we made our estimates, * assume that we won't be able to free or overwrite anything. 
*/ if (tx->tx_objset && - dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) > + dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) > tx->tx_lastsnap_txg) { towrite += tooverwrite; tooverwrite = tofree = 0; @@ -854,7 +1014,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) /* calculate memory footprint estimate */ memory = towrite + tooverwrite + tohold; -#ifdef ZFS_DEBUG +#ifdef DEBUG_DMU_TX /* * Add in 'tohold' to account for our dirty holds on this memory * XXX - the "fudge" factor is to account for skipped blocks that @@ -874,6 +1034,8 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) return (err); } + DMU_TX_STAT_BUMP(dmu_tx_assigned); + return (0); } @@ -928,12 +1090,15 @@ dmu_tx_unassign(dmu_tx_t *tx) int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) { + hrtime_t before, after; int err; ASSERT(tx->tx_txg == 0); ASSERT(txg_how != 0); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); + before = gethrtime(); + while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { dmu_tx_unassign(tx); @@ -945,6 +1110,11 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) txg_rele_to_quiesce(&tx->tx_txgh); + after = gethrtime(); + + dsl_pool_tx_assign_add_usecs(tx->tx_pool, + (after - before) / NSEC_PER_USEC); + return (0); } @@ -978,7 +1148,7 @@ dmu_tx_wait(dmu_tx_t *tx) void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) { -#ifdef ZFS_DEBUG +#ifdef DEBUG_DMU_TX if (tx->tx_dir == NULL || delta == 0) return; @@ -999,7 +1169,7 @@ dmu_tx_commit(dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); - while (txh = list_head(&tx->tx_holds)) { + while ((txh = list_head(&tx->tx_holds))) { dnode_t *dn = txh->txh_dnode; list_remove(&tx->tx_holds, txh); @@ -1020,10 +1190,15 @@ dmu_tx_commit(dmu_tx_t *tx) if (tx->tx_tempreserve_cookie) dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); + if (!list_is_empty(&tx->tx_callbacks)) + txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); + if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); + + list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); -#ifdef ZFS_DEBUG +#ifdef DEBUG_DMU_TX dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", tx->tx_space_towrite, refcount_count(&tx->tx_space_written), tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); @@ -1042,7 +1217,7 @@ dmu_tx_abort(dmu_tx_t *tx) ASSERT(tx->tx_txg == 0); - while (txh = list_head(&tx->tx_holds)) { + while ((txh = list_head(&tx->tx_holds))) { dnode_t *dn = txh->txh_dnode; list_remove(&tx->tx_holds, txh); @@ -1050,8 +1225,16 @@ dmu_tx_abort(dmu_tx_t *tx) if (dn != NULL) dnode_rele(dn, tx); } + + /* + * Call any registered callbacks with an error code. + */ + if (!list_is_empty(&tx->tx_callbacks)) + dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); + + list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); -#ifdef ZFS_DEBUG +#ifdef DEBUG_DMU_TX refcount_destroy_many(&tx->tx_space_written, refcount_count(&tx->tx_space_written)); refcount_destroy_many(&tx->tx_space_freed, @@ -1066,3 +1249,221 @@ dmu_tx_get_txg(dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); return (tx->tx_txg); } + +void +dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) +{ + dmu_tx_callback_t *dcb; + + dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_PUSHPAGE); + + dcb->dcb_func = func; + dcb->dcb_data = data; + + list_insert_tail(&tx->tx_callbacks, dcb); +} + +/* + * Call all the commit callbacks on a list, with a given error code. 
+ */ +void +dmu_tx_do_callbacks(list_t *cb_list, int error) +{ + dmu_tx_callback_t *dcb; + + while ((dcb = list_head(cb_list))) { + list_remove(cb_list, dcb); + dcb->dcb_func(dcb->dcb_data, error); + kmem_free(dcb, sizeof (dmu_tx_callback_t)); + } +} + +/* + * Interface to hold a bunch of attributes. + * used for creating new files. + * attrsize is the total size of all attributes + * to be added during object creation + * + * For updating/adding a single attribute dmu_tx_hold_sa() should be used. + */ + +/* + * hold necessary attribute name for attribute registration. + * should be a very rare case where this is needed. If it does + * happen it would only happen on the first write to the file system. + */ +static void +dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) +{ + int i; + + if (!sa->sa_need_attr_registration) + return; + + for (i = 0; i != sa->sa_num_attrs; i++) { + if (!sa->sa_attr_table[i].sa_registered) { + if (sa->sa_reg_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, + B_TRUE, sa->sa_attr_table[i].sa_name); + else + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, + B_TRUE, sa->sa_attr_table[i].sa_name); + } + } +} + + +void +dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) +{ + dnode_t *dn; + dmu_tx_hold_t *txh; + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, + THT_SPILL, 0, 0); + if (txh == NULL) + return; + + dn = txh->txh_dnode; + + if (dn == NULL) + return; + + /* If blkptr doesn't exist then add space to towrite */ + if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { + txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + } else { + blkptr_t *bp; + + bp = &dn->dn_phys->dn_spill; + if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + bp, bp->blk_birth)) + txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; + else + txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + if (!BP_IS_HOLE(bp)) + txh->txh_space_tounref += SPA_MAXBLOCKSIZE; + } +} + +void +dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) +{ + sa_os_t *sa = tx->tx_objset->os_sa; + + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_layout_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + else { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) + return; + + (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, + THT_SPILL, 0, 0); +} + +/* + * Hold SA attribute + * + * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) + * + * variable_size is the total size of all variable sized attributes + * passed to this function. It is not the total size of all + * variable size attributes that *may* exist on this object. 
+ */ +void +dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) +{ + uint64_t object; + sa_os_t *sa = tx->tx_objset->os_sa; + + ASSERT(hdl != NULL); + + object = sa_handle_object(hdl); + + dmu_tx_hold_bonus(tx, object); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || + tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + + if (sa->sa_force_spill || may_grow || hdl->sa_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } else { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn->dn_have_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } + DB_DNODE_EXIT(db); + } +} + +void +dmu_tx_init(void) +{ + dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc", + KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (dmu_tx_ksp != NULL) { + dmu_tx_ksp->ks_data = &dmu_tx_stats; + kstat_install(dmu_tx_ksp); + } +} + +void +dmu_tx_fini(void) +{ + if (dmu_tx_ksp != NULL) { + kstat_delete(dmu_tx_ksp); + dmu_tx_ksp = NULL; + } +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +EXPORT_SYMBOL(dmu_tx_create); +EXPORT_SYMBOL(dmu_tx_hold_write); +EXPORT_SYMBOL(dmu_tx_hold_free); +EXPORT_SYMBOL(dmu_tx_hold_zap); +EXPORT_SYMBOL(dmu_tx_hold_bonus); +EXPORT_SYMBOL(dmu_tx_abort); +EXPORT_SYMBOL(dmu_tx_assign); +EXPORT_SYMBOL(dmu_tx_wait); +EXPORT_SYMBOL(dmu_tx_commit); +EXPORT_SYMBOL(dmu_tx_get_txg); +EXPORT_SYMBOL(dmu_tx_callback_register); +EXPORT_SYMBOL(dmu_tx_do_callbacks); +EXPORT_SYMBOL(dmu_tx_hold_spill); +EXPORT_SYMBOL(dmu_tx_hold_sa_create); +EXPORT_SYMBOL(dmu_tx_hold_sa); +#endif