4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 #include <sys/zfs_context.h>
26 #include <sys/dmu_objset.h>
27 #include <sys/dmu_traverse.h>
28 #include <sys/dsl_dataset.h>
29 #include <sys/dsl_dir.h>
30 #include <sys/dsl_pool.h>
31 #include <sys/dnode.h>
34 #include <sys/dmu_impl.h>
36 #include <sys/sa_impl.h>
37 #include <sys/callb.h>
/*
 * Tunable: upper bound on the number of blocks the prefetch thread may
 * have in flight ahead of the traversal (enforced in traverse_prefetcher()
 * and copied into pd_blks_max by traverse_impl()).
 */
39 int zfs_pd_blks_max = 100;
/*
 * Shared state between the main traversal and the prefetch thread.
 * NOTE(review): the field lists of these structs are only partially
 * visible in this extract; comments cover the fields shown.
 */
41 typedef struct prefetch_data {

/* Per-traversal context passed through every callback invocation. */
51 typedef struct traverse_data {
57 	prefetch_data_t *td_pfd;	/* prefetch state; NULL-checked before use */

/*
 * One heap-allocated "stack frame" for __traverse_visitbp().  An array of
 * TRAVERSE_VISITBP_MAX_DEPTH of these stands in for true C-stack recursion
 * (see traverse_visitbp() below), keeping kernel stack usage small.
 */
62 typedef struct traverse_visitbp_data {
63 	/* Function arguments */
64 	traverse_data_t *tv_td;
65 	const dnode_phys_t *tv_dnp;
68 	const zbookmark_t *tv_zb;
70 	prefetch_data_t *tv_pd;
74 	objset_phys_t *tv_osp;		/* objset_phys when visiting an objset bp */
75 	dnode_phys_t *tv_ldnp;		/* special (meta/user/group) dnode cursor */
83 } traverse_visitbp_data_t;
/*
 * Forward declarations: traverse_visitbp() and traverse_dnode() are
 * mutually recursive (dnode blocks contain block pointers and vice versa).
 */
85 static inline int traverse_visitbp(traverse_data_t *td, const
86     dnode_phys_t *dnp, arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb);
87 static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
88     arc_buf_t *buf, uint64_t objset, uint64_t object);
/*
 * zil_parse() block callback: invoke the traversal callback on one intent
 * log block.  NOTE(review): the early-return lines after each guard are
 * not visible in this extract.
 */
91 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
93 	traverse_data_t *td = arg;

	/* A hole (birth txg 0) carries no data to visit. */
96 	if (bp->blk_birth == 0)

	/*
	 * If the log has not been claimed (claim_txg == 0), skip blocks born
	 * in or after the pool's first txg — presumably they will be claimed
	 * rather than traversed; TODO confirm against zil_claim().
	 */
99 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))

	/* ZIL blocks are bookmarked by their sequence number in the chain. */
102 	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
103 	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	/* Callback errors on ZIL blocks are deliberately ignored. */
105 	(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg);
/*
 * zil_parse() record callback: TX_WRITE records may embed a block pointer
 * (WR_INDIRECT writes); visit it.  Other record types are ignored.
 * NOTE(review): the early-return lines after each guard are not visible
 * in this extract.
 */
111 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
113 	traverse_data_t *td = arg;

115 	if (lrc->lrc_txtype == TX_WRITE) {
116 		lr_write_t *lr = (lr_write_t *)lrc;
117 		blkptr_t *bp = &lr->lr_blkptr;

		/* Hole: nothing was ever written through this pointer. */
120 		if (bp->blk_birth == 0)

		/*
		 * Only visit blocks the log has claimed; unclaimed or
		 * pre-claim blocks are reachable elsewhere.
		 */
123 		if (claim_txg == 0 || bp->blk_birth < claim_txg)

		/* Bookmark by the file (lr_foid) and block offset written. */
126 		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
127 		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

		/* Callback errors on ZIL records are deliberately ignored. */
129 		(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
/*
 * Walk a dataset's intent log, feeding each claimed block/record to the
 * traversal callback via zil_parse().
 */
136 traverse_zil(traverse_data_t *td, zil_header_t *zh)
138 	uint64_t claim_txg = zh->zh_claim_txg;

142 	 * We only want to visit blocks that have been claimed but not yet
143 	 * replayed; plus, in read-only mode, blocks that are already stable.

	/* Unclaimed log on a writeable pool: it will be claimed/replayed. */
145 	if (claim_txg == 0 && spa_writeable(td->td_spa))

	/* Temporary zilog; freed after parsing (free not visible here). */
148 	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);

150 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
/* Maximum nesting depth supported by the heap-allocated frame array. */
156 #define TRAVERSE_VISITBP_MAX_DEPTH 20

/*
 * Populate one heap "frame" for __traverse_visitbp() from its would-be
 * function arguments.  NOTE(review): the assignments of tv_td/tv_dnp/
 * tv_pbuf/tv_bp/tv_zb are not visible in this extract.
 */
159 __traverse_visitbp_init(traverse_visitbp_data_t *tv,
160     traverse_data_t *td, const dnode_phys_t *dnp,
161     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb, int depth)
171 	tv->tv_pd = td->td_pfd;
172 	tv->tv_hard = td->td_flags & TRAVERSE_HARD;
173 	tv->tv_flags = ARC_WAIT;	/* synchronous ARC reads */
174 	tv->tv_depth = depth;
/*
 * Core of the traversal: visit one block pointer and (depending on its
 * type) recurse into its children.  "Recursion" uses the next element of
 * the heap-allocated frame array (tv + 1) rather than the C stack.
 * Returns the first error, or with TRAVERSE_HARD the last error seen
 * while continuing past failures.
 * NOTE(review): many control-flow lines (returns, breaks, closing braces)
 * are not visible in this extract; comments describe the visible logic.
 */
178 __traverse_visitbp(traverse_visitbp_data_t *tv)
180 	ASSERT3S(tv->tv_depth, <, TRAVERSE_VISITBP_MAX_DEPTH);

	/* A hole is reported to the callback with a NULL bp and no children. */
182 	if (tv->tv_bp->blk_birth == 0) {
183 		tv->tv_err = tv->tv_td->td_func(tv->tv_td->td_spa, NULL, NULL,
184 		    tv->tv_pbuf, tv->tv_zb, tv->tv_dnp, tv->tv_td->td_arg);

	/* Prune subtrees entirely older than the requested starting txg. */
188 	if (tv->tv_bp->blk_birth <= tv->tv_td->td_min_txg)

	/*
	 * Flow control against the prefetch thread: block until it has
	 * fetched at least one block ahead of us, then consume one credit
	 * and wake it (it may be waiting on pd_blks_max).
	 */
191 	if (tv->tv_pd && !tv->tv_pd->pd_exited &&
192 	    ((tv->tv_pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
193 	    BP_GET_TYPE(tv->tv_bp) == DMU_OT_DNODE ||
194 	    BP_GET_LEVEL(tv->tv_bp) > 0)) {
195 		mutex_enter(&tv->tv_pd->pd_mtx);
196 		ASSERT(tv->tv_pd->pd_blks_fetched >= 0);
197 		while (tv->tv_pd->pd_blks_fetched == 0 && !tv->tv_pd->pd_exited)
198 			cv_wait(&tv->tv_pd->pd_cv, &tv->tv_pd->pd_mtx);
199 		tv->tv_pd->pd_blks_fetched--;
200 		cv_broadcast(&tv->tv_pd->pd_cv);
201 		mutex_exit(&tv->tv_pd->pd_mtx);

	/* Pre-order visit; the callback may prune with VISIT_NO_CHILDREN. */
204 	if (tv->tv_td->td_flags & TRAVERSE_PRE) {
205 		tv->tv_err = tv->tv_td->td_func(tv->tv_td->td_spa, NULL,
206 		    tv->tv_bp, tv->tv_pbuf, tv->tv_zb, tv->tv_dnp,
208 		if (tv->tv_err == TRAVERSE_VISIT_NO_CHILDREN)

	/* Indirect block: visit each child block pointer it contains. */
214 	if (BP_GET_LEVEL(tv->tv_bp) > 0) {
215 		tv->tv_epb = BP_GET_LSIZE(tv->tv_bp) >> SPA_BLKPTRSHIFT;

217 		tv->tv_err = dsl_read(NULL, tv->tv_td->td_spa, tv->tv_bp,
218 		    tv->tv_pbuf, arc_getbuf_func, &tv->tv_buf,
219 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
220 		    &tv->tv_flags, tv->tv_zb);

224 		/* recursively visitbp() blocks below this */
225 		tv->tv_cbp = tv->tv_buf->b_data;
226 		for (tv->tv_i = 0; tv->tv_i < tv->tv_epb;
227 		    tv->tv_i++, tv->tv_cbp++) {
228 			SET_BOOKMARK(&tv->tv_czb, tv->tv_zb->zb_objset,
229 			    tv->tv_zb->zb_object, tv->tv_zb->zb_level - 1,
230 			    tv->tv_zb->zb_blkid * tv->tv_epb + tv->tv_i);
			/* "Recurse" into the next heap frame, one level down. */
231 			__traverse_visitbp_init(tv + 1, tv->tv_td,
232 			    tv->tv_dnp, tv->tv_buf, tv->tv_cbp,
233 			    &tv->tv_czb, tv->tv_depth + 1);
234 			tv->tv_err = __traverse_visitbp(tv + 1);
			/* TRAVERSE_HARD: remember the error, keep going. */
238 			tv->tv_lasterr = tv->tv_err;

	/* Dnode block: traverse each dnode packed into the block. */
241 	} else if (BP_GET_TYPE(tv->tv_bp) == DMU_OT_DNODE) {
242 		tv->tv_epb = BP_GET_LSIZE(tv->tv_bp) >> DNODE_SHIFT;

244 		tv->tv_err = dsl_read(NULL, tv->tv_td->td_spa, tv->tv_bp,
245 		    tv->tv_pbuf, arc_getbuf_func, &tv->tv_buf,
246 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
247 		    &tv->tv_flags, tv->tv_zb);

251 		/* recursively visitbp() blocks below this */
252 		tv->tv_dnp = tv->tv_buf->b_data;
253 		for (tv->tv_i = 0; tv->tv_i < tv->tv_epb;
254 		    tv->tv_i++, tv->tv_dnp++) {
255 			tv->tv_err = traverse_dnode(tv->tv_td, tv->tv_dnp,
256 			    tv->tv_buf, tv->tv_zb->zb_objset,
257 			    tv->tv_zb->zb_blkid * tv->tv_epb + tv->tv_i);
261 			tv->tv_lasterr = tv->tv_err;

	/* Objset block: traverse the meta, userused, and groupused dnodes. */
264 	} else if (BP_GET_TYPE(tv->tv_bp) == DMU_OT_OBJSET) {

266 		tv->tv_err = dsl_read_nolock(NULL, tv->tv_td->td_spa,
267 		    tv->tv_bp, arc_getbuf_func, &tv->tv_buf,
268 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
269 		    &tv->tv_flags, tv->tv_zb);

273 		tv->tv_osp = tv->tv_buf->b_data;
274 		tv->tv_ldnp = &tv->tv_osp->os_meta_dnode;
275 		tv->tv_err = traverse_dnode(tv->tv_td, tv->tv_ldnp, tv->tv_buf,
276 		    tv->tv_zb->zb_objset, DMU_META_DNODE_OBJECT);
277 		if (tv->tv_err && tv->tv_hard) {
278 			tv->tv_lasterr = tv->tv_err;
		/*
		 * The userused/groupused dnodes only exist in objsets new
		 * enough to carry the larger objset_phys_t — hence the
		 * arc_buf_size() checks.
		 */
281 		if (tv->tv_err == 0 &&
282 		    arc_buf_size(tv->tv_buf) >= sizeof (objset_phys_t)) {
283 			tv->tv_ldnp = &tv->tv_osp->os_userused_dnode;
284 			tv->tv_err = traverse_dnode(tv->tv_td, tv->tv_ldnp,
285 			    tv->tv_buf, tv->tv_zb->zb_objset,
286 			    DMU_USERUSED_OBJECT);
288 		if (tv->tv_err && tv->tv_hard) {
289 			tv->tv_lasterr = tv->tv_err;
292 		if (tv->tv_err == 0 &&
293 		    arc_buf_size(tv->tv_buf) >= sizeof (objset_phys_t)) {
294 			tv->tv_ldnp = &tv->tv_osp->os_groupused_dnode;
295 			tv->tv_err = traverse_dnode(tv->tv_td, tv->tv_ldnp,
296 			    tv->tv_buf, tv->tv_zb->zb_objset,
297 			    DMU_GROUPUSED_OBJECT);

	/* Drop the ARC buffer hold taken by the dsl_read() above. */
302 		(void) arc_buf_remove_ref(tv->tv_buf, &tv->tv_buf);

	/* Post-order visit, only if the subtree was traversed cleanly. */
304 	if (tv->tv_err == 0 && tv->tv_lasterr == 0 &&
305 	    (tv->tv_td->td_flags & TRAVERSE_POST)) {
306 		tv->tv_err = tv->tv_td->td_func(tv->tv_td->td_spa, NULL,
307 		    tv->tv_bp, tv->tv_pbuf, tv->tv_zb, tv->tv_dnp,

311 	return (tv->tv_err != 0 ? tv->tv_err : tv->tv_lasterr);
315 * Due to limited stack space recursive functions are frowned upon in
316 * the Linux kernel. However, they often are the most elegant solution
317 * to a problem. The following code preserves the recursive function
318 * traverse_visitbp() but moves the local variables AND function
319 * arguments to the heap to minimize the stack frame size.  Enough
320 * space is allocated up front for TRAVERSE_VISITBP_MAX_DEPTH (20)
320 * levels of recursion.
321 * This change does ugly-up-the-code but it reduces the worst case
322 * usage from roughly 2496 bytes to 576 bytes on x86_64 archs.
/*
 * Public (file-internal) entry point: allocate the frame array on the
 * heap, run the depth-bounded traversal from frame 0, then free it.
 * KM_SLEEP means this may block but cannot fail.
 */
325 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
326     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
328 	traverse_visitbp_data_t *tv;

331 	tv = kmem_zalloc(sizeof(traverse_visitbp_data_t) *
332 	    TRAVERSE_VISITBP_MAX_DEPTH, KM_SLEEP);
333 	__traverse_visitbp_init(tv, td, dnp, pbuf, bp, zb, 0);

335 	error = __traverse_visitbp(tv);

337 	kmem_free(tv, sizeof(traverse_visitbp_data_t) *
338 	    TRAVERSE_VISITBP_MAX_DEPTH);
/*
 * Visit every block pointer of one on-disk dnode: its dn_nblkptr top-level
 * pointers, then the spill block if present.  With TRAVERSE_HARD, errors
 * are recorded in lasterr and the walk continues; otherwise the loop exit
 * on error is in lines not visible in this extract.
 */
344 traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
345     arc_buf_t *buf, uint64_t objset, uint64_t object)
347 	int j, err = 0, lasterr = 0;
349 	boolean_t hard = (td->td_flags & TRAVERSE_HARD);

351 	for (j = 0; j < dnp->dn_nblkptr; j++) {
		/* Top-level pointers sit at the dnode's highest indirect level. */
352 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
353 		err = traverse_visitbp(td, dnp, buf,
354 		    (blkptr_t *)&dnp->dn_blkptr[j], &czb);

	/* SA spill block, if this dnode has one. */
362 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
363 		SET_BOOKMARK(&czb, objset,
364 		    object, 0, DMU_SPILL_BLKID);
365 		err = traverse_visitbp(td, dnp, buf,
366 		    (blkptr_t *)&dnp->dn_spill, &czb);

373 	return (err != 0 ? err : lasterr);
/*
 * Traversal callback used by the prefetch thread: issue a speculative,
 * non-blocking ARC read for each interesting block, throttled so no more
 * than pd_blks_max reads run ahead of the consumer.
 * NOTE(review): the early-return guards (e.g. on pd_cancel) are in lines
 * not visible in this extract.
 */
378 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
379     arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
382 	prefetch_data_t *pfd = arg;
383 	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;

385 	ASSERT(pfd->pd_blks_fetched >= 0);

	/*
	 * Skip holes, data blocks unless TRAVERSE_PREFETCH_DATA was set,
	 * and intent-log blocks (read separately by traverse_zil()).
	 */
389 	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
390 	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
391 	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)

	/* Wait for a credit, then advertise one more block in flight. */
394 	mutex_enter(&pfd->pd_mtx);
395 	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
396 		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
397 	pfd->pd_blks_fetched++;
398 	cv_broadcast(&pfd->pd_cv);
399 	mutex_exit(&pfd->pd_mtx);

	/* Fire-and-forget read: NULL done callback, failures are harmless. */
401 	(void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL,
402 	    ZIO_PRIORITY_ASYNC_READ,
403 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
/*
 * Body of the prefetch taskq thread: run a shadow traversal of the same
 * tree with traverse_prefetcher() as the callback, then signal exit so
 * the main traversal stops waiting on the prefetch credits.
 */
410 traverse_prefetch_thread(void *arg)
412 	traverse_data_t *td_main = arg;
413 	traverse_data_t td = *td_main;	/* private copy; only func/arg differ */

416 	td.td_func = traverse_prefetcher;
417 	td.td_arg = td_main->td_pfd;

420 	SET_BOOKMARK(&czb, td.td_objset,
421 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
422 	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);

	/* Publish exit under the lock and wake the (possibly waiting) consumer. */
424 	mutex_enter(&td_main->td_pfd->pd_mtx);
425 	td_main->td_pfd->pd_exited = B_TRUE;
426 	cv_broadcast(&td_main->td_pfd->pd_cv);
427 	mutex_exit(&td_main->td_pfd->pd_mtx);
431  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
432  * in syncing context).
/*
 * Common traversal driver: set up traverse/prefetch state, optionally walk
 * the dataset's ZIL, optionally spawn the prefetch thread, walk from the
 * root bp, then shut the prefetcher down and free everything.
 * td/pd/czb are heap-allocated to keep this frame's stack usage small.
 */
435 traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp,
436     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)

443 	td = kmem_alloc(sizeof(traverse_data_t), KM_SLEEP);
444 	pd = kmem_zalloc(sizeof(prefetch_data_t), KM_SLEEP);
445 	czb = kmem_alloc(sizeof(zbookmark_t), KM_SLEEP);

	/* objset 0 denotes a pool-wide (MOS) traversal with no dataset. */
448 	td->td_objset = ds ? ds->ds_object : 0;
449 	td->td_rootbp = rootbp;
450 	td->td_min_txg = txg_start;
454 	td->td_flags = flags;

456 	pd->pd_blks_max = zfs_pd_blks_max;
457 	pd->pd_flags = flags;
458 	mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
459 	cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);

461 	/* See comment on ZIL traversal in dsl_scan_visitds. */
462 	if (ds != NULL && !dsl_dataset_is_snapshot(ds)) {

465 		err = dmu_objset_from_ds(ds, &os);

469 		traverse_zil(td, &os->os_zil_header);

	/*
	 * If prefetch was not requested, or the taskq dispatch failed
	 * (returns 0 on failure), mark the prefetcher as already exited so
	 * __traverse_visitbp() never waits for credits.
	 */
472 	if (!(flags & TRAVERSE_PREFETCH) ||
473 	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
475 		pd->pd_exited = B_TRUE;

477 	SET_BOOKMARK(czb, td->td_objset,
478 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
479 	err = traverse_visitbp(td, NULL, NULL, rootbp, czb);

	/* Cancel the prefetcher (if running) and wait for it to exit. */
481 	mutex_enter(&pd->pd_mtx);
482 	pd->pd_cancel = B_TRUE;
483 	cv_broadcast(&pd->pd_cv);
484 	while (!pd->pd_exited)
485 		cv_wait(&pd->pd_cv, &pd->pd_mtx);
486 	mutex_exit(&pd->pd_mtx);

488 	mutex_destroy(&pd->pd_mtx);
489 	cv_destroy(&pd->pd_cv);

491 	kmem_free(czb, sizeof(zbookmark_t));
492 	kmem_free(pd, sizeof(struct prefetch_data));
493 	kmem_free(td, sizeof(struct traverse_data));
499  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
500  * in syncing context).
/*
 * Public entry: traverse one dataset from its on-disk root block pointer,
 * visiting blocks born after txg_start.
 */
503 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
504     blkptr_cb_t func, void *arg)
506 	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds,
507 	    &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
511  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
/*
 * Public entry: traverse the MOS and then every DSL dataset in the pool.
 * TRAVERSE_HARD continues past per-dataset errors, returning the last one.
 * NOTE(review): several loop-control and error-handling lines are not
 * visible in this extract.
 */
514 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
515     blkptr_cb_t func, void *arg)
517 	int err, lasterr = 0;
519 	dsl_pool_t *dp = spa_get_dsl(spa);
520 	objset_t *mos = dp->dp_meta_objset;
521 	boolean_t hard = (flags & TRAVERSE_HARD);

	/* Visit the pool's meta-objset (MOS) first. */
524 	err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa),
525 	    txg_start, flags, func, arg);

529 	/* visit each dataset */
530 	for (obj = 1; err == 0 || (err != ESRCH && hard);
531 	    err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
532 		dmu_object_info_t doi;

534 		err = dmu_object_info(mos, obj, &doi);

542 		if (doi.doi_type == DMU_OT_DSL_DATASET) {
544 			uint64_t txg = txg_start;

			/* Config lock only held to look up/hold the dataset. */
546 			rw_enter(&dp->dp_config_rwlock, RW_READER);
547 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
548 			rw_exit(&dp->dp_config_rwlock);

			/*
			 * Blocks up to the previous snapshot were already
			 * visited via that snapshot; start after it.
			 */
555 			if (ds->ds_phys->ds_prev_snap_txg > txg)
556 				txg = ds->ds_phys->ds_prev_snap_txg;
557 			err = traverse_dataset(ds, txg, flags, func, arg);
558 			dsl_dataset_rele(ds, FTAG);

568 	return (err != 0 ? err : lasterr);
/* Export the public traversal entry points (Linux kernel/SPL build only). */
571 #if defined(_KERNEL) && defined(HAVE_SPL)
572 EXPORT_SYMBOL(traverse_dataset);
573 EXPORT_SYMBOL(traverse_pool);