Minor tweak to update script
[zfs.git] / zfs / lib / libzpool / dmu_traverse.c
index 6d57232..5124014 100644 (file)
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident  "@(#)dmu_traverse.c     1.7     08/04/01 SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_impl.h>
-#include <sys/zvol.h>
-
-#define        BP_SPAN_SHIFT(level, width)     ((level) * (width))
-
-#define        BP_EQUAL(b1, b2)                                \
-       (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \
-       (b1)->blk_birth == (b2)->blk_birth)
-
-/*
- * Compare two bookmarks.
- *
- * For ADVANCE_PRE, the visitation order is:
- *
- *     objset 0, 1, 2, ..., ZB_MAXOBJSET.
- *     object 0, 1, 2, ..., ZB_MAXOBJECT.
- *     blkoff 0, 1, 2, ...
- *     level ZB_MAXLEVEL, ..., 2, 1, 0.
- *
- * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
- * ordering vector is:
- *
- *     < objset, object, blkoff, -level >
- *
- * For ADVANCE_POST, the starting offsets aren't sequential but ending
- * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
- * The visitation order is:
- *
- *     objset 1, 2, ..., ZB_MAXOBJSET, 0.
- *     object 1, 2, ..., ZB_MAXOBJECT, 0.
- *     blkoff 1, 2, ...
- *     level 0, 1, 2, ..., ZB_MAXLEVEL.
- *
- * and thus a valid ordering vector is:
- *
- *     < objset - 1, object - 1, blkoff, level >
- *
- * Both orderings can be expressed as:
- *
- *     < objset + bias, object + bias, blkoff, level ^ bias >
- *
- * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
- * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
- *
- * Special case: an objset's osphys is represented as level -1 of object 0.
- * It is always either the very first or very last block we visit in an objset.
- * Therefore, if either bookmark's level is -1, level alone determines order.
- */
-static int
-compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
-    int advance)
-{
-       int bias = (advance & ADVANCE_PRE) ? 0 : -1;
-       uint64_t sblkoff, eblkoff;
-       int slevel, elevel, wshift;
-
-       if (szb->zb_objset + bias < ezb->zb_objset + bias)
-               return (-1);
-
-       if (szb->zb_objset + bias > ezb->zb_objset + bias)
-               return (1);
-
-       slevel = szb->zb_level;
-       elevel = ezb->zb_level;
-
-       if ((slevel | elevel) < 0)
-               return ((slevel ^ bias) - (elevel ^ bias));
-
-       if (szb->zb_object + bias < ezb->zb_object + bias)
-               return (-1);
-
-       if (szb->zb_object + bias > ezb->zb_object + bias)
-               return (1);
-
-       if (dnp == NULL)
-               return (0);
-
-       wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
-
-       sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
-       eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
-
-       if (sblkoff < eblkoff)
-               return (-1);
-
-       if (sblkoff > eblkoff)
-               return (1);
-
-       return ((elevel ^ bias) - (slevel ^ bias));
-}
-
-#define        SET_BOOKMARK(zb, objset, object, level, blkid)  \
-{                                                      \
-       (zb)->zb_objset = objset;                       \
-       (zb)->zb_object = object;                       \
-       (zb)->zb_level = level;                         \
-       (zb)->zb_blkid = blkid;                         \
-}
-
-#define        SET_BOOKMARK_LB(zb, level, blkid)               \
-{                                                      \
-       (zb)->zb_level = level;                         \
-       (zb)->zb_blkid = blkid;                         \
-}
-
-static int
-advance_objset(zseg_t *zseg, uint64_t objset, int advance)
-{
-       zbookmark_t *zb = &zseg->seg_start;
-
-       if (advance & ADVANCE_PRE) {
-               if (objset >= ZB_MAXOBJSET)
-                       return (ERANGE);
-               SET_BOOKMARK(zb, objset, 0, -1, 0);
-       } else {
-               if (objset >= ZB_MAXOBJSET)
-                       objset = 0;
-               SET_BOOKMARK(zb, objset, 1, 0, 0);
-       }
-
-       if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
-               return (ERANGE);
-
-       return (EAGAIN);
-}
-
-static int
-advance_object(zseg_t *zseg, uint64_t object, int advance)
-{
-       zbookmark_t *zb = &zseg->seg_start;
-
-       if (advance & ADVANCE_PRE) {
-               if (object >= ZB_MAXOBJECT) {
-                       SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
-               } else {
-                       SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
-               }
-       } else {
-               if (zb->zb_object == 0) {
-                       SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
-               } else {
-                       if (object >= ZB_MAXOBJECT)
-                               object = 0;
-                       SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
-               }
-       }
-
-       if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
-               return (ERANGE);
-
-       return (EAGAIN);
-}
-
-static int
-advance_from_osphys(zseg_t *zseg, int advance)
-{
-       zbookmark_t *zb = &zseg->seg_start;
-
-       ASSERT(zb->zb_object == 0);
-       ASSERT(zb->zb_level == -1);
-       ASSERT(zb->zb_blkid == 0);
-
-       if (advance & ADVANCE_PRE) {
-               SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
-       } else {
-               if (zb->zb_objset == 0)
-                       return (ERANGE);
-               SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
-       }
-
-       if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
-               return (ERANGE);
-
-       return (EAGAIN);
-}
-
-static int
-advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
-{
-       zbookmark_t *zb = &zseg->seg_start;
-       int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
-       int maxlevel = dnp->dn_nlevels - 1;
-       int level = zb->zb_level;
-       uint64_t blkid = zb->zb_blkid;
-
-       if (advance & ADVANCE_PRE) {
-               if (level > 0 && rc == 0) {
-                       level--;
-                       blkid <<= wshift;
-               } else {
-                       blkid++;
-
-                       if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
-                           dnp->dn_maxblkid)
-                               return (ERANGE);
-
-                       while (level < maxlevel) {
-                               if (P2PHASE(blkid, 1ULL << wshift))
-                                       break;
-                               blkid >>= wshift;
-                               level++;
-                       }
-               }
-       } else {
-               if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
-                       blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
-                       level = 0;
-               } else {
-                       blkid >>= wshift;
-                       level++;
-               }
-
-               while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
-                   dnp->dn_maxblkid) {
-                       if (level == maxlevel)
-                               return (ERANGE);
-                       blkid >>= wshift;
-                       level++;
-               }
-       }
-       SET_BOOKMARK_LB(zb, level, blkid);
-
-       if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
-               return (ERANGE);
-
-       return (EAGAIN);
-}
-
-/*
- * The traverse_callback function will call the function specified in th_func.
- * In the event of an error the callee, specified by th_func, must return
- * one of the following errors:
- *
- *     EINTR           - Indicates that the callee wants the traversal to
- *                       abort immediately.
- *     ERESTART        - The callee has acknowledged the error and would
- *                       like to continue.
- */
-static int
-traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
-{
-       /*
-        * Before we issue the callback, prune against maxtxg.
-        *
-        * We prune against mintxg before we get here because it's a big win.
-        * If a given block was born in txg 37, then we know that the entire
-        * subtree below that block must have been born in txg 37 or earlier.
-        * We can therefore lop off huge branches of the tree as we go.
-        *
-        * There's no corresponding optimization for maxtxg because knowing
-        * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
-        * children.  In fact, the copy-on-write design of ZFS ensures that
-        * top-level blocks will pretty much always be new.
-        *
-        * Therefore, in the name of simplicity we don't prune against
-        * maxtxg until the last possible moment -- that being right now.
-        */
-       if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
-               return (0);
-
-       /*
-        * Debugging: verify that the order we visit things agrees with the
-        * order defined by compare_bookmark().  We don't check this for
-        * log blocks because there's no defined ordering for them; they're
-        * always visited (or not) as part of visiting the objset_phys_t.
-        */
-       if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
-               zbookmark_t *zb = &bc->bc_bookmark;
-               zbookmark_t *szb = &zseg->seg_start;
-               zbookmark_t *ezb = &zseg->seg_end;
-               zbookmark_t *lzb = &th->th_lastcb;
-               dnode_phys_t *dnp = bc->bc_dnode;
-
-               ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
-               ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
-               ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
-                   lzb->zb_level == ZB_NO_LEVEL);
-               *lzb = *zb;
-       }
-
-       th->th_callbacks++;
-       return (th->th_func(bc, th->th_spa, th->th_arg));
-}
-
-static int
-traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
-       dnode_phys_t *dnp)
-{
-       zbookmark_t *zb = &bc->bc_bookmark;
-       int error;
-
-       th->th_hits++;
-
-       bc->bc_dnode = dnp;
-       bc->bc_errno = 0;
-
-       if (BP_EQUAL(&bc->bc_blkptr, bp))
-               return (0);
-
-       bc->bc_blkptr = *bp;
-
-       if (bc->bc_data == NULL)
-               return (0);
-
-       if (BP_IS_HOLE(bp)) {
-               ASSERT(th->th_advance & ADVANCE_HOLES);
-               return (0);
-       }
-
-       if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
-               error = EIO;
-       } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
-               error = 0;
-               th->th_arc_hits++;
-       } else {
-               error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
-                   BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
-                   th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
-
-               if (BP_SHOULD_BYTESWAP(bp) && error == 0)
-                       (zb->zb_level > 0 ? byteswap_uint64_array :
-                           dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
-                           BP_GET_LSIZE(bp));
-               th->th_reads++;
-       }
-
-       if (error) {
-               bc->bc_errno = error;
-               error = traverse_callback(th, NULL, bc);
-               ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
-               bc->bc_blkptr.blk_birth = -1ULL;
-       }
-
-       dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
-           bc - &th->th_cache[0][0], error,
-           zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
-
-       return (error);
+#include <sys/callb.h>
+
+#define        SET_BOOKMARK(zb, objset, object, level, blkid)  \
+{                                                       \
+       (zb)->zb_objset = objset;                       \
+       (zb)->zb_object = object;                       \
+       (zb)->zb_level = level;                         \
+       (zb)->zb_blkid = blkid;                         \
 }
 
-static int
-find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
-{
-       zbookmark_t *zb = &zseg->seg_start;
-       traverse_blk_cache_t *bc;
-       blkptr_t *bp = dnp->dn_blkptr;
-       int i, first, level;
-       int nbp = dnp->dn_nblkptr;
-       int minlevel = zb->zb_level;
-       int maxlevel = dnp->dn_nlevels - 1;
-       int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
-       int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
-       uint64_t blkid = zb->zb_blkid >> bp_shift;
-       int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
-       int rc;
-
-       if (minlevel > maxlevel || blkid >= nbp)
-               return (ERANGE);
-
-       for (level = maxlevel; level >= minlevel; level--) {
-               first = P2PHASE(blkid, 1ULL << wshift);
-
-               for (i = first; i < nbp; i++)
-                       if (bp[i].blk_birth > zseg->seg_mintxg ||
-                           BP_IS_HOLE(&bp[i]) && do_holes)
-                               break;
-
-               if (i != first) {
-                       i--;
-                       SET_BOOKMARK_LB(zb, level, blkid + (i - first));
-                       return (ENOTBLK);
-               }
-
-               bc = &th->th_cache[depth][level];
-
-               SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
-                   level, blkid);
-
-               if (rc = traverse_read(th, bc, bp + i, dnp)) {
-                       if (rc != EAGAIN) {
-                               SET_BOOKMARK_LB(zb, level, blkid);
-                       }
-                       return (rc);
-               }
-
-               if (BP_IS_HOLE(&bp[i])) {
-                       SET_BOOKMARK_LB(zb, level, blkid);
-                       th->th_lastcb.zb_level = ZB_NO_LEVEL;
-                       return (0);
-               }
-
-               nbp = 1 << wshift;
-               bp = bc->bc_data;
-               bp_shift -= wshift;
-               blkid = zb->zb_blkid >> bp_shift;
-       }
-
-       return (0);
-}
-
-static int
-get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
-    uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
-{
-       zseg_t zseg;
-       zbookmark_t *zb = &zseg.seg_start;
-       uint64_t object = *objectp;
-       int i, rc;
-
-       SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
-       SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
-
-       zseg.seg_mintxg = txg;
-       zseg.seg_maxtxg = -1ULL;
-
-       for (;;) {
-               rc = find_block(th, &zseg, mdn, depth);
-
-               if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
-                       break;
-
-               if (rc == 0 && zb->zb_level == 0) {
-                       dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
-                       for (i = 0; i < DNODES_PER_BLOCK; i++) {
-                               object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
-                               if (object >= *objectp &&
-                                   dnp[i].dn_type != DMU_OT_NONE &&
-                                   (type == -1 || dnp[i].dn_type == type)) {
-                                       *objectp = object;
-                                       *dnpp = &dnp[i];
-                                       return (0);
-                               }
-                       }
-               }
-
-               rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
-
-               if (rc == ERANGE)
-                       break;
-       }
-
-       if (rc == ERANGE)
-               *objectp = ZB_MAXOBJECT;
-
-       return (rc);
-}
+struct prefetch_data { /* state shared by traverse_visitbp() and traverse_prefetcher() */
+       kmutex_t pd_mtx; /* held while pd_blks_fetched is changed or waited on */
+       kcondvar_t pd_cv; /* broadcast after every change to pd_blks_fetched */
+       int pd_blks_max; /* prefetcher waits while pd_blks_fetched >= this */
+       int pd_blks_fetched; /* ++ in traverse_prefetcher(), -- in traverse_visitbp() */
+       int pd_flags; /* tested against TRAVERSE_PREFETCH_DATA */
+       boolean_t pd_cancel; /* when set, traverse_prefetcher() returns EINTR */
+       boolean_t pd_exited; /* when set, traverse_visitbp() stops waiting for prefetch */
+};
+
+struct traverse_data { /* per-traversal context handed down through traverse_visitbp() */
+       spa_t *td_spa; /* pool handle passed to the ARC reads and to td_func */
+       uint64_t td_objset; /* objset id stamped into bookmarks built for ZIL blocks */
+       blkptr_t *td_rootbp; /* presumably the bp the traversal starts from; not used in the code visible here -- confirm */
+       uint64_t td_min_txg; /* bps with blk_birth <= this are not visited */
+       int td_flags; /* TRAVERSE_PRE / TRAVERSE_POST select when td_func is called */
+       struct prefetch_data *td_pfd; /* prefetch state; NULL-checked before use */
+       blkptr_cb_t *td_func; /* callback invoked for each visited block pointer */
+       void *td_arg; /* opaque argument passed as the last parameter of td_func */
+};
 
 /* ARGSUSED */
 static void
 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
-       traverse_handle_t *th = arg;
-       traverse_blk_cache_t *bc = &th->th_zil_cache;
-       zbookmark_t *zb = &bc->bc_bookmark;
-       zseg_t *zseg = list_head(&th->th_seglist);
+       struct traverse_data *td = arg;
+       zbookmark_t zb;
 
-       if (bp->blk_birth <= zseg->seg_mintxg)
+       if (bp->blk_birth == 0)
                return;
 
-       if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
-               zb->zb_object = 0;
-               zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
-               bc->bc_blkptr = *bp;
-               (void) traverse_callback(th, zseg, bc);
-       }
+       if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
+               return;
+
+       zb.zb_objset = td->td_objset;
+       zb.zb_object = 0;
+       zb.zb_level = -1;
+       zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+       VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
 }
 
 /* ARGSUSED */
 static void
 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 {
-       traverse_handle_t *th = arg;
-       traverse_blk_cache_t *bc = &th->th_zil_cache;
-       zbookmark_t *zb = &bc->bc_bookmark;
-       zseg_t *zseg = list_head(&th->th_seglist);
+       struct traverse_data *td = arg;
 
        if (lrc->lrc_txtype == TX_WRITE) {
                lr_write_t *lr = (lr_write_t *)lrc;
                blkptr_t *bp = &lr->lr_blkptr;
+               zbookmark_t zb;
 
-               if (bp->blk_birth <= zseg->seg_mintxg)
+               if (bp->blk_birth == 0)
                        return;
 
-               if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
-                       zb->zb_object = lr->lr_foid;
-                       zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
-                       bc->bc_blkptr = *bp;
-                       (void) traverse_callback(th, zseg, bc);
-               }
+               if (claim_txg == 0 || bp->blk_birth < claim_txg)
+                       return;
+
+               zb.zb_objset = td->td_objset;
+               zb.zb_object = lr->lr_foid;
+               zb.zb_level = BP_GET_LEVEL(bp);
+               zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
+               VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
        }
 }
 
 static void
-traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
+traverse_zil(struct traverse_data *td, zil_header_t *zh)
 {
-       spa_t *spa = th->th_spa;
-       dsl_pool_t *dp = spa_get_dsl(spa);
-       objset_phys_t *osphys = bc->bc_data;
-       zil_header_t *zh = &osphys->os_zil_header;
        uint64_t claim_txg = zh->zh_claim_txg;
        zilog_t *zilog;
 
-       ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
-       ASSERT(bc->bc_bookmark.zb_level == -1);
-
        /*
         * We only want to visit blocks that have been claimed but not yet
         * replayed (or, in read-only mode, blocks that *would* be claimed).
@@ -546,372 +122,285 @@ traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
        if (claim_txg == 0 && (spa_mode & FWRITE))
                return;
 
-       th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
-
-       zilog = zil_alloc(dp->dp_meta_objset, zh);
+       zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
 
-       (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
+       (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
            claim_txg);
 
        zil_free(zilog);
 }
 
 static int
-traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
+traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
+    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
 {
-       zbookmark_t *zb = &zseg->seg_start;
-       traverse_blk_cache_t *bc;
-       dnode_phys_t *dn, *dn_tmp;
-       int worklimit = 100;
-       int rc;
-
-       dprintf("<%llu, %llu, %d, %llx>\n",
-           zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
-
-       bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
-       dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
-
-       SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
-
-       rc = traverse_read(th, bc, mosbp, dn);
-
-       if (rc)         /* If we get ERESTART, we've got nowhere left to go */
-               return (rc == ERESTART ? EINTR : rc);
-
-       ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
-
-       if (zb->zb_objset != 0) {
-               uint64_t objset = zb->zb_objset;
-               dsl_dataset_phys_t *dsp;
-
-               rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
-                   DMU_OT_DSL_DATASET, ZB_MOS_CACHE);
-
-               if (objset != zb->zb_objset)
-                       rc = advance_objset(zseg, objset, th->th_advance);
-
-               if (rc != 0)
-                       return (rc);
-
-               dsp = DN_BONUS(dn_tmp);
-
-               bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
-               dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
-
-               SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
-
-               /*
-                * If we're traversing an open snapshot, we know that it
-                * can't be deleted (because it's open) and it can't change
-                * (because it's a snapshot).  Therefore, once we've gotten
-                * from the uberblock down to the snapshot's objset_phys_t,
-                * we no longer need to synchronize with spa_sync(); we're
-                * traversing a completely static block tree from here on.
-                */
-               if (th->th_advance & ADVANCE_NOLOCK) {
-                       ASSERT(th->th_locked);
-                       rw_exit(spa_traverse_rwlock(th->th_spa));
-                       th->th_locked = 0;
-               }
-
-               rc = traverse_read(th, bc, &dsp->ds_bp, dn);
-
-               if (rc != 0) {
-                       if (rc == ERESTART)
-                               rc = advance_objset(zseg, zb->zb_objset + 1,
-                                   th->th_advance);
-                       return (rc);
-               }
-
-               if (th->th_advance & ADVANCE_PRUNE)
-                       zseg->seg_mintxg =
-                           MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
+       zbookmark_t czb;
+       int err = 0;
+       arc_buf_t *buf = NULL;
+       struct prefetch_data *pd = td->td_pfd;
+
+       if (bp->blk_birth == 0) {
+               err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg);
+               return (err);
        }
 
-       if (zb->zb_level == -1) {
-               ASSERT(zb->zb_object == 0);
-               ASSERT(zb->zb_blkid == 0);
-               ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
-
-               if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
-                       rc = traverse_callback(th, zseg, bc);
-                       if (rc) {
-                               ASSERT(rc == EINTR);
-                               return (rc);
-                       }
-                       if ((th->th_advance & ADVANCE_ZIL) &&
-                           zb->zb_objset != 0)
-                               traverse_zil(th, bc);
-               }
+       if (bp->blk_birth <= td->td_min_txg)
+               return (0);
 
-               return (advance_from_osphys(zseg, th->th_advance));
+       if (pd && !pd->pd_exited &&
+           ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
+           BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
+               mutex_enter(&pd->pd_mtx);
+               ASSERT(pd->pd_blks_fetched >= 0);
+               while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
+                       cv_wait(&pd->pd_cv, &pd->pd_mtx);
+               pd->pd_blks_fetched--;
+               cv_broadcast(&pd->pd_cv);
+               mutex_exit(&pd->pd_mtx);
        }
 
-       if (zb->zb_object != 0) {
-               uint64_t object = zb->zb_object;
-
-               rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
-                   zseg->seg_mintxg, -1, ZB_MDN_CACHE);
-
-               if (object != zb->zb_object)
-                       rc = advance_object(zseg, object, th->th_advance);
-
-               if (rc != 0)
-                       return (rc);
-
-               dn = dn_tmp;
+       if (td->td_flags & TRAVERSE_PRE) {
+               err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+               if (err)
+                       return (err);
        }
 
-       if (zb->zb_level == ZB_MAXLEVEL)
-               zb->zb_level = dn->dn_nlevels - 1;
-
-       for (;;) {
-               rc = find_block(th, zseg, dn, ZB_DN_CACHE);
-
-               if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
-                       break;
-
-               if (rc == 0) {
-                       bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
-                       ASSERT(bc->bc_dnode == dn);
-                       ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
-                       rc = traverse_callback(th, zseg, bc);
-                       if (rc) {
-                               ASSERT(rc == EINTR);
-                               return (rc);
-                       }
-                       if (BP_IS_HOLE(&bc->bc_blkptr)) {
-                               ASSERT(th->th_advance & ADVANCE_HOLES);
-                               rc = ENOTBLK;
+       if (BP_GET_LEVEL(bp) > 0) {
+               uint32_t flags = ARC_WAIT;
+               int i;
+               blkptr_t *cbp;
+               int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+
+               err = arc_read(NULL, td->td_spa, bp, pbuf,
+                   arc_getbuf_func, &buf,
+                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+               if (err)
+                       return (err);
+
+               /* recursively visit each child block pointer in this indirect block */
+               cbp = buf->b_data;
+               for (i = 0; i < epb; i++, cbp++) {
+                       SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+                           zb->zb_level - 1,
+                           zb->zb_blkid * epb + i);
+                       err = traverse_visitbp(td, dnp, buf, cbp, &czb);
+                       if (err)
+                               break;
+               }
+       } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+               uint32_t flags = ARC_WAIT;
+               int i, j;
+               int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+               err = arc_read(NULL, td->td_spa, bp, pbuf,
+                   arc_getbuf_func, &buf,
+                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+               if (err)
+                       return (err);
+
+               /* recursively visit the block pointers of each dnode in this block */
+               dnp = buf->b_data;
+               for (i = 0; i < epb && err == 0; i++, dnp++) {
+                       for (j = 0; j < dnp->dn_nblkptr; j++) {
+                               SET_BOOKMARK(&czb, zb->zb_objset,
+                                   zb->zb_blkid * epb + i,
+                                   dnp->dn_nlevels - 1, j);
+                               err = traverse_visitbp(td, dnp, buf,
+                                   (blkptr_t *)&dnp->dn_blkptr[j], &czb);
+                               if (err)
+                                       break;
                        }
                }
-
-               rc = advance_block(zseg, dn, rc, th->th_advance);
-
-               if (rc == ERANGE)
-                       break;
-
+       } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+               uint32_t flags = ARC_WAIT;
+               objset_phys_t *osp;
+               int j;
+
+               err = arc_read_nolock(NULL, td->td_spa, bp,
+                   arc_getbuf_func, &buf,
+                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+               if (err)
+                       return (err);
+
+               osp = buf->b_data;
                /*
-                * Give spa_sync() a chance to run.
+                * traverse_zil is just here for zdb's leak checking.
+                * For other consumers, there will be no ZIL blocks.
                 */
-               if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
-                       th->th_syncs++;
-                       return (EAGAIN);
+               traverse_zil(td, &osp->os_zil_header);
+
+               for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
+                       SET_BOOKMARK(&czb, zb->zb_objset, 0,
+                           osp->os_meta_dnode.dn_nlevels - 1, j);
+                       err = traverse_visitbp(td, &osp->os_meta_dnode, buf,
+                           (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j],
+                           &czb);
+                       if (err)
+                               break;
                }
-
-               if (--worklimit == 0)
-                       return (EAGAIN);
        }
 
-       if (rc == ERANGE)
-               rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
-
-       return (rc);
-}
-
-/*
- * It is the caller's responsibility to ensure that the dsl_dataset_t
- * doesn't go away during traversal.
- */
-int
-traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
-    blkptr_cb_t func, void *arg)
-{
-       spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
-       traverse_handle_t *th;
-       int err;
-
-       th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
-
-       traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
+       if (buf)
+               (void) arc_buf_remove_ref(buf, &buf);
 
-       while ((err = traverse_more(th)) == EAGAIN)
-               continue;
+       if (err == 0 && (td->td_flags & TRAVERSE_POST))
+               err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
 
-       traverse_fini(th);
        return (err);
 }
 
-int
-traverse_zvol(objset_t *os, int advance,  blkptr_cb_t func, void *arg)
+/* ARGSUSED */
+static int
+traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
+    const dnode_phys_t *dnp, void *arg)
 {
-       spa_t *spa = dmu_objset_spa(os);
-       traverse_handle_t *th;
-       int err;
+       struct prefetch_data *pfd = arg;        /* from traverse_impl() */
+       uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 
-       th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL);
+       ASSERT(pfd->pd_blks_fetched >= 0);
+       if (pfd->pd_cancel)
+               return (EINTR);         /* cancel was requested */
 
-       traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ);
-
-       while ((err = traverse_more(th)) == EAGAIN)
-               continue;
-
-       traverse_fini(th);
-       return (err);
-}
-
-int
-traverse_more(traverse_handle_t *th)
-{
-       zseg_t *zseg = list_head(&th->th_seglist);
-       uint64_t save_txg;      /* XXX won't be necessary with real itinerary */
-       krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
-       blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
-       int rc;
-
-       if (zseg == NULL)
+       if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
+           BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))
                return (0);
 
-       th->th_restarts++;
+       mutex_enter(&pfd->pd_mtx);
+       while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
+               cv_wait(&pfd->pd_cv, &pfd->pd_mtx);  /* at pd_blks_max */
+       pfd->pd_blks_fetched++;         /* count this bp, even if cancelled */
+       cv_broadcast(&pfd->pd_cv);
+       mutex_exit(&pfd->pd_mtx);
 
-       save_txg = zseg->seg_mintxg;
+       (void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
+           ZIO_PRIORITY_ASYNC_READ,
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+           &aflags, zb);       /* return value is discarded */
 
-       rw_enter(rw, RW_READER);
-       th->th_locked = 1;
-
-       rc = traverse_segment(th, zseg, mosbp);
-       ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
-
-       if (th->th_locked)
-               rw_exit(rw);
-       th->th_locked = 0;
-
-       zseg->seg_mintxg = save_txg;
-
-       if (rc == ERANGE) {
-               list_remove(&th->th_seglist, zseg);
-               kmem_free(zseg, sizeof (*zseg));
-               return (EAGAIN);
-       }
-
-       return (rc);
+       return (0);
 }
 
-/*
- * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
- * are not included.  The blocks covered by this segment will all have
- * mintxg < birth < maxtxg.
- */
 static void
-traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
-    uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
-    uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
+traverse_prefetch_thread(void *arg)
 {
-       zseg_t *zseg;
-
-       zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
-
-       zseg->seg_mintxg = mintxg;
-       zseg->seg_maxtxg = maxtxg;
+       struct traverse_data *td_main = arg;    /* traverse_impl()'s td */
+       struct traverse_data td = *td_main;     /* private copy to modify */
+       zbookmark_t czb;
 
-       zseg->seg_start.zb_objset = sobjset;
-       zseg->seg_start.zb_object = sobject;
-       zseg->seg_start.zb_level = slevel;
-       zseg->seg_start.zb_blkid = sblkid;
+       td.td_func = traverse_prefetcher;       /* replaces caller's func */
+       td.td_arg = td_main->td_pfd;            /* prefetcher's 'arg' */
+       td.td_pfd = NULL;
 
-       zseg->seg_end.zb_objset = eobjset;
-       zseg->seg_end.zb_object = eobject;
-       zseg->seg_end.zb_level = elevel;
-       zseg->seg_end.zb_blkid = eblkid;
+       SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0);
+       (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
 
-       list_insert_tail(&th->th_seglist, zseg);
+       mutex_enter(&td_main->td_pfd->pd_mtx);
+       td_main->td_pfd->pd_exited = B_TRUE;    /* traverse_impl() waits */
+       cv_broadcast(&td_main->td_pfd->pd_cv);
+       mutex_exit(&td_main->td_pfd->pd_mtx);
 }
 
-void
-traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
-    uint64_t objset, uint64_t object)
+/*
+ * Common traversal engine: fill in a traverse_data from the arguments
+ * ('txg_start' is stored as td_min_txg) and run traverse_visitbp() from
+ * 'rootbp', returning its result.
+ *
+ * If 'flags' includes TRAVERSE_PREFETCH, traverse_prefetch_thread() is
+ * dispatched on system_taskq with a pointer to the local 'td'.  If the flag
+ * is clear, or taskq_dispatch() returns 0, pd_exited is set up front because
+ * there is no thread to wait for.  After the main traversal, pd_cancel is
+ * set and this function blocks until pd_exited is true; 'td' and 'pd' are
+ * locals, so the prefetch thread must be done with them before we return.
+ *
+ * NB: dataset must not be changing on-disk (e.g., is a snapshot or we are
+ * in syncing context).
+ */
+static int
+traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
+    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
 {
-       if (th->th_advance & ADVANCE_PRE)
-               traverse_add_segment(th, mintxg, maxtxg,
-                   objset, object, ZB_MAXLEVEL, 0,
-                   objset, object, 0, ZB_MAXBLKID);
-       else
-               traverse_add_segment(th, mintxg, maxtxg,
-                   objset, object, 0, 0,
-                   objset, object, 0, ZB_MAXBLKID);
-}
+       struct traverse_data td;
+       struct prefetch_data pd = { 0 };
+       zbookmark_t czb;
+       int err;
 
-void
-traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
-    uint64_t objset)
-{
-       if (th->th_advance & ADVANCE_PRE)
-               traverse_add_segment(th, mintxg, maxtxg,
-                   objset, 0, -1, 0,
-                   objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
-       else
-               traverse_add_segment(th, mintxg, maxtxg,
-                   objset, 1, 0, 0,
-                   objset, 0, -1, 0);
-}
+       td.td_spa = spa;
+       td.td_objset = objset;
+       td.td_rootbp = rootbp;
+       td.td_min_txg = txg_start;
+       td.td_func = func;
+       td.td_arg = arg;
+       td.td_pfd = &pd;
+       td.td_flags = flags;
+
+       pd.pd_blks_max = 100;   /* traverse_prefetcher() waits at this count */
+       pd.pd_flags = flags;
+       mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
+       cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
+
+       if (!(flags & TRAVERSE_PREFETCH) ||
+           0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
+           &td, TQ_NOQUEUE))
+               pd.pd_exited = B_TRUE;  /* no prefetch thread to wait for */
+
+       SET_BOOKMARK(&czb, objset, 0, -1, 0);
+       err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
+
+       mutex_enter(&pd.pd_mtx);
+       pd.pd_cancel = B_TRUE;          /* ask the prefetcher to stop */
+       cv_broadcast(&pd.pd_cv);
+       while (!pd.pd_exited)
+               cv_wait(&pd.pd_cv, &pd.pd_mtx);
+       mutex_exit(&pd.pd_mtx);
+
+       mutex_destroy(&pd.pd_mtx);
+       cv_destroy(&pd.pd_cv);
 
-void
-traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
-{
-       if (th->th_advance & ADVANCE_PRE)
-               traverse_add_segment(th, mintxg, maxtxg,
-                   0, 0, -1, 0,
-                   ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
-       else
-               traverse_add_segment(th, mintxg, maxtxg,
-                   1, 1, 0, 0,
-                   0, 0, -1, 0);
+       return (err);
 }
 
-traverse_handle_t *
-traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
-    int zio_flags)
+/*
+ * Traverse dataset 'ds': a thin wrapper that passes the dataset's pool spa,
+ * its object number (ds_object) and its block pointer (ds_phys->ds_bp),
+ * together with the caller's 'txg_start', 'flags', 'func' and 'arg', to
+ * traverse_impl() and returns that function's result.
+ *
+ * NB: dataset must not be changing on-disk (e.g., is a snapshot or we are
+ * in syncing context).
+ */
+int
+traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
+    blkptr_cb_t func, void *arg)
 {
-       traverse_handle_t *th;
-       int d, l;
-
-       th = kmem_zalloc(sizeof (*th), KM_SLEEP);
-
-       th->th_spa = spa;
-       th->th_func = func;
-       th->th_arg = arg;
-       th->th_advance = advance;
-       th->th_lastcb.zb_level = ZB_NO_LEVEL;
-       th->th_noread.zb_level = ZB_NO_LEVEL;
-       th->th_zio_flags = zio_flags;
-
-       list_create(&th->th_seglist, sizeof (zseg_t),
-           offsetof(zseg_t, seg_node));
-
-       for (d = 0; d < ZB_DEPTH; d++) {
-               for (l = 0; l < ZB_MAXLEVEL; l++) {
-                       if ((advance & ADVANCE_DATA) ||
-                           l != 0 || d != ZB_DN_CACHE)
-                               th->th_cache[d][l].bc_data =
-                                   zio_buf_alloc(SPA_MAXBLOCKSIZE);
-               }
-       }
-
-       return (th);
+       return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
+           &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
 }
 
-void
-traverse_fini(traverse_handle_t *th)
+/*
+ * Traverse the whole pool with TRAVERSE_PRE: first the MOS (objset 0, from
+ * the pool's root block pointer, txg_start 0), then every MOS object whose
+ * type is DMU_OT_DSL_DATASET.  Each dataset is held, traversed via
+ * traverse_dataset() starting from its ds_prev_snap_txg, and released.
+ *
+ * Returns the first nonzero error from any step.  The object walk ends when
+ * dmu_object_next() returns nonzero; ESRCH from it is converted to 0
+ * (presumably it means there are no more objects -- confirm).
+ *
+ * NB: pool must not be changing on-disk (e.g., from zdb or sync context).
+ */
+int
+traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg)
 {
-       int d, l;
-       zseg_t *zseg;
-
-       for (d = 0; d < ZB_DEPTH; d++)
-               for (l = 0; l < ZB_MAXLEVEL; l++)
-                       if (th->th_cache[d][l].bc_data != NULL)
-                               zio_buf_free(th->th_cache[d][l].bc_data,
-                                   SPA_MAXBLOCKSIZE);
-
-       while ((zseg = list_head(&th->th_seglist)) != NULL) {
-               list_remove(&th->th_seglist, zseg);
-               kmem_free(zseg, sizeof (*zseg));
+       int err;
+       uint64_t obj;
+       dsl_pool_t *dp = spa_get_dsl(spa);
+       objset_t *mos = dp->dp_meta_objset;
+
+       /* visit the MOS itself, as objset 0 */
+       err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
+           0, TRAVERSE_PRE, func, arg);
+       if (err)
+               return (err);
+
+       /* visit each dataset object in the MOS, starting at object 1 */
+       for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) {
+               dmu_object_info_t doi;
+
+               err = dmu_object_info(mos, obj, &doi);
+               if (err)
+                       return (err);
+
+               if (doi.doi_type == DMU_OT_DSL_DATASET) {
+                       dsl_dataset_t *ds;
+                       rw_enter(&dp->dp_config_rwlock, RW_READER);
+                       err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
+                       rw_exit(&dp->dp_config_rwlock);
+                       if (err)
+                               return (err);
+                       err = traverse_dataset(ds,
+                           ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE,
+                           func, arg);
+                       dsl_dataset_rele(ds, FTAG);     /* drop hold first */
+                       if (err)
+                               return (err);
+               }
        }
-
-       list_destroy(&th->th_seglist);
-
-       dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
-           th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
-           th->th_syncs, th->th_restarts);
-
-       kmem_free(th, sizeof (*th));
+       if (err == ESRCH)
+               err = 0;        /* not reported to the caller as an error */
+       return (err);
 }