X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fdnode_sync.c;h=76e603753cae747c78d5dddab94594dd9e8f9ab2;hb=c99c90015ece64746e20b74245caca41d1dbefe1;hp=779cfc96f9e3c4eca3ed6c47b5a42bbcb6dbd3b7;hpb=172bb4bd5e4afef721dd4d2972d8680d983f144b;p=zfs.git diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 779cfc9..76e6037 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -18,13 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -79,7 +78,11 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) if (child == NULL) continue; - ASSERT3P(child->db_dnode, ==, dn); +#ifdef DEBUG + DB_DNODE_ENTER(child); + ASSERT3P(DB_DNODE(child), ==, dn); + DB_DNODE_EXIT(child); +#endif /* DEBUG */ if (child->db_parent && child->db_parent != dn->dn_dbuf) { ASSERT(child->db_parent->db_level == db->db_level); ASSERT(child->db_blkptr != @@ -122,7 +125,7 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) if (BP_IS_HOLE(bp)) continue; - bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx); + bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); bzero(bp, sizeof (blkptr_t)); blocks_freed += 1; @@ -138,15 +141,18 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) int off, num; int i, err, epbs; uint64_t txg = tx->tx_txg; + dnode_t *dn; - epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; off = start - (db->db_blkid * 1<=, 0); ASSERT3U(num, >=, 0); ASSERT3U(db->db_level, >, 0); - ASSERT3U(db->db.db_size, ==, 1<db_dnode->dn_phys->dn_indblkshift); + ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); ASSERT(db->db_blkptr != NULL); @@ -158,10 +164,10 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) ASSERT(db->db_level == 1); - rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(db->db_dnode, db->db_level-1, + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, db->db_level-1, (db->db_blkid << epbs) + i, TRUE, FTAG, &child); - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; ASSERT(err == 0); @@ -203,6 +209,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) dbuf_rele(child, FTAG); } + DB_DNODE_EXIT(db); } #endif @@ -212,7 +219,7 @@ static int free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, dmu_tx_t *tx) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; blkptr_t *bp; dmu_buf_impl_t *subdb; uint64_t start, end, dbstart, dbend, i; @@ -230,10 +237,12 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, if (db->db_state != DB_CACHED) (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - arc_release(db->db_buf, db); + dbuf_release_bp(db); bp = (blkptr_t *)db->db.db_data; - epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; shift = (db->db_level - 1) * epbs; dbstart = db->db_blkid << epbs; start = blkid >> shift; @@ -256,6 +265,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, blocks_freed = free_blocks(dn, bp, end-start+1, tx); arc_buf_freeze(db->db_buf); ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + DB_DNODE_EXIT(db); return (all ? ALL : blocks_freed); } @@ -264,7 +274,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb); - ASSERT3U(err, ==, 0); + ASSERT0(err); rw_exit(&dn->dn_struct_rwlock); if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) { @@ -275,6 +285,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, } dbuf_rele(subdb, FTAG); } + DB_DNODE_EXIT(db); arc_buf_freeze(db->db_buf); #ifdef ZFS_DEBUG bp -= (end-start)+1; @@ -283,7 +294,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, continue; else if (i == end && !trunc) continue; - ASSERT3U(bp->blk_birth, ==, 0); + ASSERT0(bp->blk_birth); } #endif ASSERT(all || blocks_freed == 0 || db->db_last_dirty); @@ -319,8 +330,8 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); (void) free_blocks(dn, bp + blkid, nblks, tx); if (trunc) { - uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * - (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERTV(uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * + (dn->dn_phys->dn_datablkszsec<dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || @@ -339,7 +350,7 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db); - ASSERT3U(err, ==, 0); + ASSERT0(err); rw_exit(&dn->dn_struct_rwlock); if (free_children(db, blkid, nblks, trunc, tx) == ALL) { @@ -349,8 +360,8 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) dbuf_rele(db, FTAG); } if (trunc) { - uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * - (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERTV(uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * + (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT)); dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || @@ -378,7 +389,11 @@ dnode_evict_dbufs(dnode_t *dn) for (; db != ▮ db = list_head(&dn->dn_dbufs)) { list_remove(&dn->dn_dbufs, db); list_insert_tail(&dn->dn_dbufs, db); - ASSERT3P(db->db_dnode, ==, dn); +#ifdef DEBUG + DB_DNODE_ENTER(db); + ASSERT3P(DB_DNODE(db), ==, dn); + DB_DNODE_EXIT(db); +#endif /* DEBUG */ mutex_enter(&db->db_mtx); if (db->db_state == DB_EVICTING) { @@ -405,9 +420,13 @@ dnode_evict_dbufs(dnode_t *dn) if (evicting) delay(1); pass++; - ASSERT(pass < 100); /* sanity check */ + if ((pass % 100) == 0) + dprintf("Exceeded %d passes evicting dbufs\n", pass); } while (progress); + if (pass >= 100) + dprintf("Required %d passes to evict dbufs\n", pass); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); @@ -422,10 +441,13 @@ dnode_undirty_dbufs(list_t *list) { dbuf_dirty_record_t *dr; - while (dr = list_head(list)) { + while ((dr = list_head(list))) { dmu_buf_impl_t *db = dr->dr_dbuf; uint64_t txg = dr->dr_txg; + if (db->db_level != 0) + dnode_undirty_dbufs(&dr->dt.di.dr_children); + mutex_enter(&db->db_mtx); /* XXX - use dbuf_undirty()? */ list_remove(list, dr); @@ -433,16 +455,12 @@ dnode_undirty_dbufs(list_t *list) db->db_last_dirty = NULL; db->db_dirtycnt -= 1; if (db->db_level == 0) { - ASSERT(db->db_blkid == DB_BONUS_BLKID || + ASSERT(db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_data == db->db_buf); dbuf_unoverride(dr); - mutex_exit(&db->db_mtx); - } else { - mutex_exit(&db->db_mtx); - dnode_undirty_dbufs(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); - dbuf_rele(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); } } @@ -457,7 +475,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) * Our contents should have been freed in dnode_sync() by the * free range record inserted by the caller of dnode_free(). */ - ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); + ASSERT0(DN_USED_BYTES(dn->dn_phys)); ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); @@ -493,6 +511,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dn->dn_maxblkid = 0; dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; + dn->dn_have_spill = B_FALSE; mutex_exit(&dn->dn_mtx); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); @@ -506,9 +525,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) /* * Write out the dnode's dirty buffers. - * - * NOTE: The dnode is kept in memory by being dirty. Once the - * dirty bit is cleared, it may be evicted. Beware of this! */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx) @@ -517,33 +533,43 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnode_phys_t *dnp = dn->dn_phys; int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; + boolean_t kill_spill = B_FALSE; + ASSERTV(static const dnode_phys_t zerodn = { 0 }); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); + ASSERT(dnp->dn_type != DMU_OT_NONE || + bcmp(dnp, &zerodn, DNODE_SIZE) == 0); DNODE_VERIFY(dn); ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); + if (dmu_objset_userused_enabled(dn->dn_objset) && + !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + mutex_enter(&dn->dn_mtx); + dn->dn_oldused = DN_USED_BYTES(dn->dn_phys); + dn->dn_oldflags = dn->dn_phys->dn_flags; + dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; + mutex_exit(&dn->dn_mtx); + dmu_objset_userquota_get_ids(dn, B_FALSE, tx); + } else { + /* Once we account for it, we should always account for it. */ + ASSERT(!(dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED)); + } + mutex_enter(&dn->dn_mtx); if (dn->dn_allocated_txg == tx->tx_txg) { /* The dnode is newly allocated or reallocated */ if (dnp->dn_type == DMU_OT_NONE) { /* this is a first alloc, not a realloc */ - /* XXX shouldn't the phys already be zeroed? */ - bzero(dnp, DNODE_CORE_SIZE); dnp->dn_nlevels = 1; + dnp->dn_nblkptr = dn->dn_nblkptr; } - if (dn->dn_nblkptr > dnp->dn_nblkptr) { - /* zero the new blkptrs we are gaining */ - bzero(dnp->dn_blkptr + dnp->dn_nblkptr, - sizeof (blkptr_t) * - (dn->dn_nblkptr - dnp->dn_nblkptr)); - } dnp->dn_type = dn->dn_type; dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonuslen = dn->dn_bonuslen; - dnp->dn_nblkptr = dn->dn_nblkptr; } ASSERT(dnp->dn_nlevels > 1 || @@ -556,6 +582,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) SPA_MINBLOCKSIZE) == 0); ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || dn->dn_maxblkid == 0 || list_head(list) != NULL || + avl_last(&dn->dn_ranges[txgoff]) || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == dnp->dn_datablkszsec); dnp->dn_datablkszsec = @@ -572,6 +599,24 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dn->dn_next_bonuslen[txgoff] = 0; } + if (dn->dn_next_bonustype[txgoff]) { + ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff])); + dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; + dn->dn_next_bonustype[txgoff] = 0; + } + + /* + * We will either remove a spill block when a file is being removed + * or we have been asked to remove it. + */ + if (dn->dn_rm_spillblk[txgoff] || + ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && + dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) { + if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) + kill_spill = B_TRUE; + dn->dn_rm_spillblk[txgoff] = 0; + } + if (dn->dn_next_indblkshift[txgoff]) { ASSERT(dnp->dn_nlevels == 1); dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; @@ -588,8 +633,15 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); + if (kill_spill) { + (void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx); + mutex_enter(&dn->dn_mtx); + dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; + mutex_exit(&dn->dn_mtx); + } + /* process all the "freed" ranges in the file */ - while (rp = avl_last(&dn->dn_ranges[txgoff])) { + while ((rp = avl_last(&dn->dn_ranges[txgoff]))) { dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx); /* grab the mutex so we don't race with dnode_block_freed() */ mutex_enter(&dn->dn_mtx); @@ -603,6 +655,31 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) return; } + if (dn->dn_next_nblkptr[txgoff]) { + /* this should only happen on a realloc */ + ASSERT(dn->dn_allocated_txg == tx->tx_txg); + if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { + /* zero the new blkptrs we are gaining */ + bzero(dnp->dn_blkptr + dnp->dn_nblkptr, + sizeof (blkptr_t) * + (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); +#ifdef ZFS_DEBUG + } else { + int i; + ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr); + /* the blkptrs we are losing better be unallocated */ + for (i = 0; i < dnp->dn_nblkptr; i++) { + if (i >= dn->dn_next_nblkptr[txgoff]) + ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); + } +#endif + } + mutex_enter(&dn->dn_mtx); + dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff]; + dn->dn_next_nblkptr[txgoff] = 0; + mutex_exit(&dn->dn_mtx); + } + if (dn->dn_next_nlevels[txgoff]) { dnode_increase_indirection(dn, tx); dn->dn_next_nlevels[txgoff] = 0; @@ -610,7 +687,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dbuf_sync_list(list, tx); - if (dn->dn_object != DMU_META_DNODE_OBJECT) { + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { ASSERT3P(list_head(list), ==, NULL); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); }