4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/zfs_context.h>
27 #include <sys/dmu_objset.h>
28 #include <sys/dmu_traverse.h>
29 #include <sys/dsl_dataset.h>
30 #include <sys/dsl_dir.h>
31 #include <sys/dsl_pool.h>
32 #include <sys/dnode.h>
35 #include <sys/dmu_impl.h>
36 #include <sys/callb.h>
/*
 * Fill in a zbookmark_t identifying a block by its logical position:
 * (objset, object, level, blkid).  Multi-statement macro used throughout
 * the traversal code below.  NOTE(review): the macro's opening wrapper
 * line (brace / do-while) is not visible in this chunk — confirm against
 * the full file.
 */
#define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
	(zb)->zb_objset = objset; \
	(zb)->zb_object = object; \
	(zb)->zb_level = level; \
	(zb)->zb_blkid = blkid; \
/*
 * State shared between the main traversal and the prefetch thread.
 * NOTE(review): field declarations are elided in this chunk; from usage
 * below it contains at least pd_mtx/pd_cv, pd_blks_fetched, pd_blks_max,
 * pd_flags, pd_cancel, and pd_exited.
 */
struct prefetch_data {
/*
 * Per-traversal state: from usage below it carries the spa, objset id,
 * root blkptr, callback func/arg, flags, minimum txg filter, and an
 * optional link to the prefetch-thread state.  NOTE(review): most field
 * declarations are elided in this chunk.
 */
struct traverse_data {
	struct prefetch_data *td_pfd;	/* NULL-able link to prefetch state */
/*
 * zil_parse() block callback: visit one intent-log block.
 *
 * Holes (birth txg 0) are skipped.  If the log has not been claimed
 * (claim_txg == 0), blocks born at or after the pool's first txg are
 * skipped as untrustworthy.  The bookmark uses the ZIL sequence number
 * (from the block checksum) as the blkid.  NOTE(review): the guard
 * bodies/returns and the zbookmark declaration are elided in this chunk.
 */
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
	struct traverse_data *td = arg;

	/* Birth txg 0 means a hole — nothing to visit. */
	if (bp->blk_birth == 0)

	/* Unclaimed log: don't trust blocks born in/after the first txg. */
	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))

	zb.zb_objset = td->td_objset;
	/* ZIL blocks are identified by their sequence number, not an offset. */
	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
	VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
/*
 * zil_parse() record callback: visit the block referenced by one
 * intent-log record.  Only TX_WRITE records embed a block pointer;
 * holes and blocks born before claim_txg (already claimed/visited via
 * the log chain) are skipped.  NOTE(review): the guard bodies/returns
 * and the zbookmark declaration are elided in this chunk.
 */
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
	struct traverse_data *td = arg;

	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;

		/* Birth txg 0 means a hole — nothing to visit. */
		if (bp->blk_birth == 0)

		/* Blocks born before the claim txg were covered elsewhere. */
		if (claim_txg == 0 || bp->blk_birth < claim_txg)

		zb.zb_objset = td->td_objset;
		zb.zb_object = lr->lr_foid;
		zb.zb_level = BP_GET_LEVEL(bp);
		/* Convert the record's byte offset to a logical block id. */
		zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
		VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
/*
 * Visit all blocks referenced by an objset's intent log by allocating a
 * transient zilog and running zil_parse() with the two callbacks above.
 * NOTE(review): the early-return body, the zilog declaration, the
 * zil_parse() trailing arguments, and the zil_free() are elided in this
 * chunk.
 */
traverse_zil(struct traverse_data *td, zil_header_t *zh)
	uint64_t claim_txg = zh->zh_claim_txg;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed (or, in read-only mode, blocks that *would* be claimed).
	 */
	if (claim_txg == 0 && (spa_mode & FWRITE))

	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);

	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
/*
 * Core recursive visitor: invoke the traversal callback on bp and then
 * recurse into its children — the block pointers inside an indirect
 * block, the dnodes inside a DMU_OT_DNODE block, or the meta-dnode and
 * ZIL of a DMU_OT_OBJSET block.
 *
 * Honors TRAVERSE_PRE / TRAVERSE_POST callback ordering, prunes
 * subtrees born at or before td_min_txg, and rate-matches against the
 * prefetch thread via td_pfd's credit counter.
 *
 * NOTE(review): several declarations (err, i, j, czb, cbp, osp) and the
 * error-check/return/closing-brace lines are elided in this chunk.
 */
traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
	arc_buf_t *buf = NULL;
	struct prefetch_data *pd = td->td_pfd;

	/* A hole: report it to the callback (bp == NULL) and don't recurse. */
	if (bp->blk_birth == 0) {
		err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg);

	/* Entire subtree predates the traversal window — prune it. */
	if (bp->blk_birth <= td->td_min_txg)

	/*
	 * Flow control: wait until the prefetcher has fetched at least one
	 * block ahead of us, then consume one credit and wake it.  Data
	 * blocks only participate when TRAVERSE_PREFETCH_DATA is set.
	 */
	if (pd && !pd->pd_exited &&
	    ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_blks_fetched >= 0);
		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
			cv_wait(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_blks_fetched--;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);

	/* Pre-order: callback fires before the children are visited. */
	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);

	if (BP_GET_LEVEL(bp) > 0) {
		/* Indirect block: recurse on each embedded block pointer. */
		uint32_t flags = ARC_WAIT;
		/* Number of blkptrs per indirect block. */
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = arc_read(NULL, td->td_spa, bp, pbuf,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);

		/* recursively visitbp() blocks below this */
		/* NOTE(review): the child bookmark's level argument is elided here. */
		for (i = 0; i < epb; i++, cbp++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp, buf, cbp, &czb);
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		/* Block of dnodes: recurse on every blkptr of every dnode. */
		uint32_t flags = ARC_WAIT;
		/* Number of dnodes per block. */
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

		err = arc_read(NULL, td->td_spa, bp, pbuf,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb && err == 0; i++, dnp++) {
			for (j = 0; j < dnp->dn_nblkptr; j++) {
				/* Child object number = parent blkid * epb + i. */
				SET_BOOKMARK(&czb, zb->zb_objset,
				    zb->zb_blkid * epb + i,
				    dnp->dn_nlevels - 1, j);
				err = traverse_visitbp(td, dnp, buf,
				    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		/* Objset block: visit its ZIL, then its meta-dnode tree. */
		uint32_t flags = ARC_WAIT;

		err = arc_read_nolock(NULL, td->td_spa, bp,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);

		/*
		 * traverse_zil is just here for zdb's leak checking.
		 * For other consumers, there will be no ZIL blocks.
		 */
		traverse_zil(td, &osp->os_zil_header);

		for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
			/* Meta-dnode blocks are bookmarked as object 0. */
			SET_BOOKMARK(&czb, zb->zb_objset, 0,
			    osp->os_meta_dnode.dn_nlevels - 1, j);
			err = traverse_visitbp(td, &osp->os_meta_dnode, buf,
			    (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j],

	/* Drop the reference arc_read gave us on the child buffer. */
	(void) arc_buf_remove_ref(buf, &buf);

	/* Post-order: callback fires after all children, and only on success. */
	if (err == 0 && (td->td_flags & TRAVERSE_POST))
		err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
/*
 * Traversal callback used by the prefetch thread: issue a speculative,
 * non-blocking ARC read for each interesting block pointer, throttled so
 * the prefetcher never runs more than pd_blks_max blocks ahead of the
 * main traversal.  NOTE(review): the early-return bodies, the cancel
 * check, the arc_read_nolock trailing arguments, and the return value
 * are elided in this chunk.
 */
traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
    const dnode_phys_t *dnp, void *arg)
	struct prefetch_data *pfd = arg;
	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;

	ASSERT(pfd->pd_blks_fetched >= 0);

	/* Skip holes; skip plain data blocks unless TRAVERSE_PREFETCH_DATA. */
	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))

	/* Wait for a free credit (or cancellation), then take one credit. */
	mutex_enter(&pfd->pd_mtx);
	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
	pfd->pd_blks_fetched++;
	cv_broadcast(&pfd->pd_cv);
	mutex_exit(&pfd->pd_mtx);

	/* Fire-and-forget: CANFAIL|SPECULATIVE, errors are acceptable here. */
	(void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_READ,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
/*
 * Body of the prefetch taskq thread: clone the main traversal's state,
 * swap in the prefetching callback, walk the same tree from the root,
 * and announce exit via pd_exited/pd_cv so the main thread (which may be
 * blocked waiting for prefetch credits) can proceed.  NOTE(review): the
 * czb declaration is elided in this chunk.
 */
traverse_prefetch_thread(void *arg)
	struct traverse_data *td_main = arg;
	struct traverse_data td = *td_main;	/* private copy of the state */

	td.td_func = traverse_prefetcher;
	td.td_arg = td_main->td_pfd;

	/* Start at the root: objset-level bookmark (level -1, blkid 0). */
	SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0);
	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);

	/* Signal exit; broadcast wakes the main thread if it's waiting. */
	mutex_enter(&td_main->td_pfd->pd_mtx);
	td_main->td_pfd->pd_exited = B_TRUE;
	cv_broadcast(&td_main->td_pfd->pd_cv);
	mutex_exit(&td_main->td_pfd->pd_mtx);
/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 *
 * Common traversal driver: set up traverse_data/prefetch_data, optionally
 * dispatch the prefetch thread, walk the tree from rootbp, then shut the
 * prefetcher down and destroy the synchronization objects.  NOTE(review):
 * declarations (czb, err), some td field initializations, the taskq
 * dispatch arguments, and the return are elided in this chunk.
 */
traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
	struct traverse_data td;
	struct prefetch_data pd = { 0 };

	td.td_objset = objset;
	td.td_rootbp = rootbp;
	td.td_min_txg = txg_start;

	/* The prefetcher may run at most this many blocks ahead of us. */
	pd.pd_blks_max = 100;

	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * If prefetch wasn't requested, or dispatching the prefetch thread
	 * fails, mark it exited so traverse_visitbp() never waits on it.
	 */
	if (!(flags & TRAVERSE_PREFETCH) ||
	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
		pd.pd_exited = B_TRUE;

	/* Start at the root: objset-level bookmark (level -1, blkid 0). */
	SET_BOOKMARK(&czb, objset, 0, -1, 0);
	err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);

	/* Cancel the prefetcher and wait for it to acknowledge exit. */
	mutex_enter(&pd.pd_mtx);
	pd.pd_cancel = B_TRUE;
	cv_broadcast(&pd.pd_cv);
	while (!pd.pd_exited)
		cv_wait(&pd.pd_cv, &pd.pd_mtx);
	mutex_exit(&pd.pd_mtx);

	mutex_destroy(&pd.pd_mtx);
	cv_destroy(&pd.pd_cv);
/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 *
 * Traverse a single dataset from its on-disk root block pointer,
 * visiting only blocks born after txg_start; func/arg and flags are
 * passed straight through to traverse_impl().
 */
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
    blkptr_cb_t func, void *arg)
	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
	    &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
/*
 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
 *
 * Visit every block in the pool: first the MOS (objset 0) from the pool's
 * root block pointer, then each DSL dataset object found in the MOS, each
 * traversed from its previous-snapshot txg forward (blocks older than
 * that belong to the previous snapshot and were visited there).
 * NOTE(review): this function is truncated at the end of this chunk;
 * error checks, declarations (err, obj, ds), and the tail are elided.
 */
traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg)
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;

	/* Visit the MOS in its entirety (txg_start 0, pre-order). */
	err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
	    0, TRAVERSE_PRE, func, arg);

	/* visit each dataset */
	for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) {
		dmu_object_info_t doi;

		err = dmu_object_info(mos, obj, &doi);

		if (doi.doi_type == DMU_OT_DSL_DATASET) {
			/* dp_config_rwlock protects taking the dataset hold. */
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
			rw_exit(&dp->dp_config_rwlock);

			/* Skip blocks already visited via the prev snapshot. */
			err = traverse_dataset(ds,
			    ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE,
			dsl_dataset_rele(ds, FTAG);