Fix zfs_txg_timeout module parameter
[zfs.git] module/zfs/dsl_scan.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24
25 #include <sys/dsl_scan.h>
26 #include <sys/dsl_pool.h>
27 #include <sys/dsl_dataset.h>
28 #include <sys/dsl_prop.h>
29 #include <sys/dsl_dir.h>
30 #include <sys/dsl_synctask.h>
31 #include <sys/dnode.h>
32 #include <sys/dmu_tx.h>
33 #include <sys/dmu_objset.h>
34 #include <sys/arc.h>
35 #include <sys/zap.h>
36 #include <sys/zio.h>
37 #include <sys/zfs_context.h>
38 #include <sys/fs/zfs.h>
39 #include <sys/zfs_znode.h>
40 #include <sys/spa_impl.h>
41 #include <sys/vdev_impl.h>
42 #include <sys/zil_impl.h>
43 #include <sys/zio_checksum.h>
44 #include <sys/ddt.h>
45 #include <sys/sa.h>
46 #include <sys/sa_impl.h>
47 #ifdef _KERNEL
48 #include <sys/zfs_vfsops.h>
49 #endif
50
51 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
52
53 static scan_cb_t dsl_scan_scrub_cb;
54 static dsl_syncfunc_t dsl_scan_cancel_sync;
55 static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
56
57 int zfs_top_maxinflight = 32;           /* maximum I/Os per top-level */
58 int zfs_resilver_delay = 2;             /* number of ticks to delay resilver */
59 int zfs_scrub_delay = 4;                /* number of ticks to delay scrub */
60 int zfs_scan_idle = 50;                 /* idle window in clock ticks */
61
62 int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
63 int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
64 int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
65 int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
66 int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
67 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
68 int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
69
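/*
 * Illustrative sketch (assumes a Linux/SPL build; not necessarily verbatim
 * from this file): tunables such as the ones above are typically exported
 * to userspace as module parameters, along these lines.  Note that
 * zfs_txg_timeout, used by the pause checks below, is defined outside
 * this file.
 */
#if 0
module_param(zfs_scan_min_time_ms, int, 0644);
MODULE_PARM_DESC(zfs_scan_min_time_ms, "Min millisecs to scrub per txg");

module_param(zfs_resilver_min_time_ms, int, 0644);
MODULE_PARM_DESC(zfs_resilver_min_time_ms, "Min millisecs to resilver per txg");
#endif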
70 #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
71         ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
72         (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
73
74 /* the order has to match pool_scan_type */
75 static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
76         NULL,
77         dsl_scan_scrub_cb,      /* POOL_SCAN_SCRUB */
78         dsl_scan_scrub_cb,      /* POOL_SCAN_RESILVER */
79 };
80
81 int
82 dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
83 {
84         int err;
85         dsl_scan_t *scn;
86         spa_t *spa = dp->dp_spa;
87         uint64_t f;
88
89         scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
90         scn->scn_dp = dp;
91
92         err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
93             "scrub_func", sizeof (uint64_t), 1, &f);
94         if (err == 0) {
95                 /*
96                  * There was an old-style scrub in progress.  Restart a
97                  * new-style scrub from the beginning.
98                  */
99                 scn->scn_restart_txg = txg;
100                 zfs_dbgmsg("old-style scrub was in progress; "
101                     "restarting new-style scrub in txg %llu",
102                     scn->scn_restart_txg);
103
104                 /*
105                  * Load the queue obj from the old location so that it
106                  * can be freed by dsl_scan_done().
107                  */
108                 (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
109                     "scrub_queue", sizeof (uint64_t), 1,
110                     &scn->scn_phys.scn_queue_obj);
111         } else {
112                 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
113                     DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
114                     &scn->scn_phys);
115                 if (err == ENOENT)
116                         return (0);
117                 else if (err)
118                         return (err);
119
120                 if (scn->scn_phys.scn_state == DSS_SCANNING &&
121                     spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
122                         /*
123                          * A new-type scrub was in progress on an old
124                          * pool, and the pool was accessed by old
125                          * software.  Restart from the beginning, since
126                          * the old software may have changed the pool in
127                          * the meantime.
128                          */
129                         scn->scn_restart_txg = txg;
130                         zfs_dbgmsg("new-style scrub was modified "
131                             "by old software; restarting in txg %llu",
132                             scn->scn_restart_txg);
133                 }
134         }
135
136         spa_scan_stat_init(spa);
137         return (0);
138 }
139
140 void
141 dsl_scan_fini(dsl_pool_t *dp)
142 {
143         if (dp->dp_scan) {
144                 kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
145                 dp->dp_scan = NULL;
146         }
147 }
148
149 /* ARGSUSED */
150 static int
151 dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
152 {
153         dsl_scan_t *scn = arg1;
154
155         if (scn->scn_phys.scn_state == DSS_SCANNING)
156                 return (EBUSY);
157
158         return (0);
159 }
160
161 /* ARGSUSED */
162 static void
163 dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
164 {
165         dsl_scan_t *scn = arg1;
166         pool_scan_func_t *funcp = arg2;
167         dmu_object_type_t ot = 0;
168         dsl_pool_t *dp = scn->scn_dp;
169         spa_t *spa = dp->dp_spa;
170
171         ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
172         ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
173         bzero(&scn->scn_phys, sizeof (scn->scn_phys));
174         scn->scn_phys.scn_func = *funcp;
175         scn->scn_phys.scn_state = DSS_SCANNING;
176         scn->scn_phys.scn_min_txg = 0;
177         scn->scn_phys.scn_max_txg = tx->tx_txg;
178         scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
179         scn->scn_phys.scn_start_time = gethrestime_sec();
180         scn->scn_phys.scn_errors = 0;
181         scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
182         scn->scn_restart_txg = 0;
183         spa_scan_stat_init(spa);
184
185         if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
186                 scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
187
188                 /* rewrite all disk labels */
189                 vdev_config_dirty(spa->spa_root_vdev);
190
191                 if (vdev_resilver_needed(spa->spa_root_vdev,
192                     &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
193                         spa_event_notify(spa, NULL, FM_EREPORT_ZFS_RESILVER_START);
194                 } else {
195                         spa_event_notify(spa, NULL, FM_EREPORT_ZFS_SCRUB_START);
196                 }
197
198                 spa->spa_scrub_started = B_TRUE;
199                 /*
200                  * If this is an incremental scrub, limit the DDT scrub phase
201                  * to just the auto-ditto class (for correctness); the rest
202                  * of the scrub should go faster using top-down pruning.
203                  */
204                 if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
205                         scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
206
207         }
208
209         /* back to the generic stuff */
210
211         if (dp->dp_blkstats == NULL) {
212                 dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t),
213                     KM_PUSHPAGE | KM_NODEBUG);
214         }
215         bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
216
217         if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
218                 ot = DMU_OT_ZAP_OTHER;
219
220         scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
221             ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
222
223         dsl_scan_sync_state(scn, tx);
224
225         spa_history_log_internal(LOG_POOL_SCAN, spa, tx,
226             "func=%u mintxg=%llu maxtxg=%llu",
227             *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
228 }
229
230 /* ARGSUSED */
231 static void
232 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
233 {
234         static const char *old_names[] = {
235                 "scrub_bookmark",
236                 "scrub_ddt_bookmark",
237                 "scrub_ddt_class_max",
238                 "scrub_queue",
239                 "scrub_min_txg",
240                 "scrub_max_txg",
241                 "scrub_func",
242                 "scrub_errors",
243                 NULL
244         };
245
246         dsl_pool_t *dp = scn->scn_dp;
247         spa_t *spa = dp->dp_spa;
248         int i;
249
250         /* Remove any remnants of an old-style scrub. */
251         for (i = 0; old_names[i]; i++) {
252                 (void) zap_remove(dp->dp_meta_objset,
253                     DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
254         }
255
256         if (scn->scn_phys.scn_queue_obj != 0) {
257                 VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
258                     scn->scn_phys.scn_queue_obj, tx));
259                 scn->scn_phys.scn_queue_obj = 0;
260         }
261
262         /*
263          * If we were "restarted" from a stopped state, don't bother
264          * with anything else.
265          */
266         if (scn->scn_phys.scn_state != DSS_SCANNING)
267                 return;
268
269         if (complete)
270                 scn->scn_phys.scn_state = DSS_FINISHED;
271         else
272                 scn->scn_phys.scn_state = DSS_CANCELED;
273
274         spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx,
275             "complete=%u", complete);
276
277         if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
278                 mutex_enter(&spa->spa_scrub_lock);
279                 while (spa->spa_scrub_inflight > 0) {
280                         cv_wait(&spa->spa_scrub_io_cv,
281                             &spa->spa_scrub_lock);
282                 }
283                 mutex_exit(&spa->spa_scrub_lock);
284                 spa->spa_scrub_started = B_FALSE;
285                 spa->spa_scrub_active = B_FALSE;
286
287                 /*
288                  * If the scrub/resilver completed, update all DTLs to
289                  * reflect this.  Whether it succeeded or not, vacate
290                  * all temporary scrub DTLs.
291                  */
292                 vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
293                     complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
294                 if (complete) {
295                         spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
296                             FM_EREPORT_ZFS_RESILVER_FINISH :
297                             FM_EREPORT_ZFS_SCRUB_FINISH);
298                 }
299                 spa_errlog_rotate(spa);
300
301                 /*
302                  * We may have finished replacing a device.
303                  * Let the async thread assess this and handle the detach.
304                  */
305                 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
306         }
307
308         scn->scn_phys.scn_end_time = gethrestime_sec();
309 }
310
311 /* ARGSUSED */
312 static int
313 dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
314 {
315         dsl_scan_t *scn = arg1;
316
317         if (scn->scn_phys.scn_state != DSS_SCANNING)
318                 return (ENOENT);
319         return (0);
320 }
321
322 /* ARGSUSED */
323 static void
324 dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
325 {
326         dsl_scan_t *scn = arg1;
327
328         dsl_scan_done(scn, B_FALSE, tx);
329         dsl_scan_sync_state(scn, tx);
330 }
331
332 int
333 dsl_scan_cancel(dsl_pool_t *dp)
334 {
335         boolean_t complete = B_FALSE;
336         int err;
337
338         err = dsl_sync_task_do(dp, dsl_scan_cancel_check,
339             dsl_scan_cancel_sync, dp->dp_scan, &complete, 3);
340         return (err);
341 }
342
343 static void dsl_scan_visitbp(blkptr_t *bp,
344     const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
345     dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
346     dmu_tx_t *tx);
347 inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
348     dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
349     dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
350
351 void
352 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
353 {
354         zio_free(dp->dp_spa, txg, bp);
355 }
356
357 void
358 dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
359 {
360         ASSERT(dsl_pool_sync_context(dp));
361         zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
362 }
363
364 int
365 dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
366     arc_done_func_t *done, void *private, int priority, int zio_flags,
367     uint32_t *arc_flags, const zbookmark_t *zb)
368 {
369         return (arc_read(pio, spa, bpp, pbuf, done, private,
370             priority, zio_flags, arc_flags, zb));
371 }
372
373 int
374 dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
375     arc_done_func_t *done, void *private, int priority, int zio_flags,
376     uint32_t *arc_flags, const zbookmark_t *zb)
377 {
378         return (arc_read_nolock(pio, spa, bpp, done, private,
379             priority, zio_flags, arc_flags, zb));
380 }
381
382 static boolean_t
383 bookmark_is_zero(const zbookmark_t *zb)
384 {
385         return (zb->zb_objset == 0 && zb->zb_object == 0 &&
386             zb->zb_level == 0 && zb->zb_blkid == 0);
387 }
388
389 /* dnp is the dnode for zb1->zb_object */
390 static boolean_t
391 bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
392     const zbookmark_t *zb2)
393 {
394         uint64_t zb1nextL0, zb2thisobj;
395
396         ASSERT(zb1->zb_objset == zb2->zb_objset);
397         ASSERT(zb2->zb_level == 0);
398
399         /*
400          * A bookmark in the deadlist is considered to be after
401          * everything else.
402          */
403         if (zb2->zb_object == DMU_DEADLIST_OBJECT)
404                 return (B_TRUE);
405
406         /* The objset_phys_t isn't before anything. */
407         if (dnp == NULL)
408                 return (B_FALSE);
409
410         zb1nextL0 = (zb1->zb_blkid + 1) <<
411             ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
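        /*
         * For example, with 16K indirect blocks (dn_indblkshift == 14) each
         * indirect block holds 1 << (14 - SPA_BLKPTRSHIFT) == 128 block
         * pointers, so a level-1 bookmark with blkid == 3 covers L0 blkids
         * 384..511 and zb1nextL0 == 512.
         */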
412
413         zb2thisobj = zb2->zb_object ? zb2->zb_object :
414             zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
415
416         if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
417                 uint64_t nextobj = zb1nextL0 *
418                     (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
419                 return (nextobj <= zb2thisobj);
420         }
421
422         if (zb1->zb_object < zb2thisobj)
423                 return (B_TRUE);
424         if (zb1->zb_object > zb2thisobj)
425                 return (B_FALSE);
426         if (zb2->zb_object == DMU_META_DNODE_OBJECT)
427                 return (B_FALSE);
428         return (zb1nextL0 <= zb2->zb_blkid);
429 }
430
431 static uint64_t
432 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
433 {
434         uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
435         if (dsl_dataset_is_snapshot(ds))
436                 return (MIN(smt, ds->ds_phys->ds_creation_txg));
437         return (smt);
438 }
439
440 static void
441 dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
442 {
443         VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
444             DMU_POOL_DIRECTORY_OBJECT,
445             DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
446             &scn->scn_phys, tx));
447 }
448
449 static boolean_t
450 dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
451 {
452         uint64_t elapsed_nanosecs;
453         int mintime;
454
455         /* we never skip user/group accounting objects */
456         if (zb && (int64_t)zb->zb_object < 0)
457                 return (B_FALSE);
458
459         if (scn->scn_pausing)
460                 return (B_TRUE); /* we're already pausing */
461
462         if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
463                 return (B_FALSE); /* we're resuming */
464
465         /* We only know how to resume from level-0 blocks. */
466         if (zb && zb->zb_level != 0)
467                 return (B_FALSE);
468
469         mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
470             zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
471         elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
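        /*
         * elapsed_nanosecs / NANOSEC yields whole seconds, compared against
         * zfs_txg_timeout (seconds); elapsed_nanosecs / MICROSEC yields
         * milliseconds, compared against mintime (milliseconds).
         */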
472         if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
473             (elapsed_nanosecs / MICROSEC > mintime &&
474             txg_sync_waiting(scn->scn_dp)) ||
475             spa_shutting_down(scn->scn_dp->dp_spa)) {
476                 if (zb) {
477                         dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
478                             (longlong_t)zb->zb_objset,
479                             (longlong_t)zb->zb_object,
480                             (longlong_t)zb->zb_level,
481                             (longlong_t)zb->zb_blkid);
482                         scn->scn_phys.scn_bookmark = *zb;
483                 }
484                 dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
485                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
486                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
487                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
488                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
489                 scn->scn_pausing = B_TRUE;
490                 return (B_TRUE);
491         }
492         return (B_FALSE);
493 }
494
495 typedef struct zil_scan_arg {
496         dsl_pool_t      *zsa_dp;
497         zil_header_t    *zsa_zh;
498 } zil_scan_arg_t;
499
500 /* ARGSUSED */
501 static int
502 dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
503 {
504         zil_scan_arg_t *zsa = arg;
505         dsl_pool_t *dp = zsa->zsa_dp;
506         dsl_scan_t *scn = dp->dp_scan;
507         zil_header_t *zh = zsa->zsa_zh;
508         zbookmark_t zb;
509
510         if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
511                 return (0);
512
513         /*
514          * One block ("stubby") can be allocated a long time ago; we
515          * want to visit that one because it has been allocated
516          * (on-disk) even if it hasn't been claimed (even though for
517          * scrub there's nothing to do to it).
518          */
519         if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
520                 return (0);
521
522         SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
523             ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
524
525         VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
526         return (0);
527 }
528
529 /* ARGSUSED */
530 static int
531 dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
532 {
533         if (lrc->lrc_txtype == TX_WRITE) {
534                 zil_scan_arg_t *zsa = arg;
535                 dsl_pool_t *dp = zsa->zsa_dp;
536                 dsl_scan_t *scn = dp->dp_scan;
537                 zil_header_t *zh = zsa->zsa_zh;
538                 lr_write_t *lr = (lr_write_t *)lrc;
539                 blkptr_t *bp = &lr->lr_blkptr;
540                 zbookmark_t zb;
541
542                 if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
543                         return (0);
544
545                 /*
546                  * birth can be < claim_txg if this record's txg is
547                  * already txg sync'ed (but this log block contains
548                  * other records that are not synced)
549                  */
550                 if (claim_txg == 0 || bp->blk_birth < claim_txg)
551                         return (0);
552
553                 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
554                     lr->lr_foid, ZB_ZIL_LEVEL,
555                     lr->lr_offset / BP_GET_LSIZE(bp));
556
557                 VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
558         }
559         return (0);
560 }
561
562 static void
563 dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
564 {
565         uint64_t claim_txg = zh->zh_claim_txg;
566         zil_scan_arg_t zsa = { dp, zh };
567         zilog_t *zilog;
568
569         /*
570          * We only want to visit blocks that have been claimed but not yet
571          * replayed (or, in read-only mode, blocks that *would* be claimed).
572          */
573         if (claim_txg == 0 && spa_writeable(dp->dp_spa))
574                 return;
575
576         zilog = zil_alloc(dp->dp_meta_objset, zh);
577
578         (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
579             claim_txg);
580
581         zil_free(zilog);
582 }
583
584 /* ARGSUSED */
585 static void
586 dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
587     uint64_t objset, uint64_t object, uint64_t blkid)
588 {
589         zbookmark_t czb;
590         uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
591
592         if (zfs_no_scrub_prefetch)
593                 return;
594
595         if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
596             (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
597                 return;
598
599         SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
600
601         /*
602          * XXX need to make sure all of these arc_read() prefetches are
603          * done before setting xlateall (similar to dsl_read())
604          */
605         (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
606             buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
607             ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
608 }
609
610 static boolean_t
611 dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
612     const zbookmark_t *zb)
613 {
614         /*
615          * We never skip over user/group accounting objects (obj<0)
616          */
617         if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
618             (int64_t)zb->zb_object >= 0) {
619                 /*
620                  * If we already visited this bp & everything below (in
621                  * a prior txg sync), don't bother doing it again.
622                  */
623                 if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
624                         return (B_TRUE);
625
626                 /*
627                  * If we found the block we're trying to resume from, or
628                  * we went past it to a different object, zero it out to
629                  * indicate that it's OK to start checking for pausing
630                  * again.
631                  */
632                 if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
633                     zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
634                         dprintf("resuming at %llx/%llx/%llx/%llx\n",
635                             (longlong_t)zb->zb_objset,
636                             (longlong_t)zb->zb_object,
637                             (longlong_t)zb->zb_level,
638                             (longlong_t)zb->zb_blkid);
639                         bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
640                 }
641         }
642         return (B_FALSE);
643 }
644
645 /*
646  * Return nonzero on i/o error.
647  * Return new buf to write out in *bufp.
648  */
649 inline __attribute__((always_inline)) static int
650 dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
651     dnode_phys_t *dnp, const blkptr_t *bp,
652     const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
653 {
654         dsl_pool_t *dp = scn->scn_dp;
655         int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
656         int err;
657
658         if (BP_GET_LEVEL(bp) > 0) {
659                 uint32_t flags = ARC_WAIT;
660                 int i;
661                 blkptr_t *cbp;
662                 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
663
664                 err = arc_read_nolock(NULL, dp->dp_spa, bp,
665                     arc_getbuf_func, bufp,
666                     ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
667                 if (err) {
668                         scn->scn_phys.scn_errors++;
669                         return (err);
670                 }
671                 for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
672                         dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
673                             zb->zb_object, zb->zb_blkid * epb + i);
674                 }
675                 for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
676                         zbookmark_t czb;
677
678                         SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
679                             zb->zb_level - 1,
680                             zb->zb_blkid * epb + i);
681                         dsl_scan_visitbp(cbp, &czb, dnp,
682                             *bufp, ds, scn, ostype, tx);
683                 }
684         } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
685                 uint32_t flags = ARC_WAIT;
686
687                 err = arc_read_nolock(NULL, dp->dp_spa, bp,
688                     arc_getbuf_func, bufp,
689                     ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
690                 if (err) {
691                         scn->scn_phys.scn_errors++;
692                         return (err);
693                 }
694         } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
695                 uint32_t flags = ARC_WAIT;
696                 dnode_phys_t *cdnp;
697                 int i, j;
698                 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
699
700                 err = arc_read_nolock(NULL, dp->dp_spa, bp,
701                     arc_getbuf_func, bufp,
702                     ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
703                 if (err) {
704                         scn->scn_phys.scn_errors++;
705                         return (err);
706                 }
707                 for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
708                         for (j = 0; j < cdnp->dn_nblkptr; j++) {
709                                 blkptr_t *cbp = &cdnp->dn_blkptr[j];
710                                 dsl_scan_prefetch(scn, *bufp, cbp,
711                                     zb->zb_objset, zb->zb_blkid * epb + i, j);
712                         }
713                 }
714                 for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
715                         dsl_scan_visitdnode(scn, ds, ostype,
716                             cdnp, *bufp, zb->zb_blkid * epb + i, tx);
717                 }
718
719         } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
720                 uint32_t flags = ARC_WAIT;
721                 objset_phys_t *osp;
722
723                 err = arc_read_nolock(NULL, dp->dp_spa, bp,
724                     arc_getbuf_func, bufp,
725                     ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
726                 if (err) {
727                         scn->scn_phys.scn_errors++;
728                         return (err);
729                 }
730
731                 osp = (*bufp)->b_data;
732
733                 dsl_scan_visitdnode(scn, ds, osp->os_type,
734                     &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
735
736                 if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
737                         /*
738                          * We also always visit user/group accounting
739                          * objects, and never skip them, even if we are
740                          * pausing.  This is necessary so that the space
741                          * deltas from this txg get integrated.
742                          */
743                         dsl_scan_visitdnode(scn, ds, osp->os_type,
744                             &osp->os_groupused_dnode, *bufp,
745                             DMU_GROUPUSED_OBJECT, tx);
746                         dsl_scan_visitdnode(scn, ds, osp->os_type,
747                             &osp->os_userused_dnode, *bufp,
748                             DMU_USERUSED_OBJECT, tx);
749                 }
750         }
751
752         return (0);
753 }
754
755 inline __attribute__((always_inline)) static void
756 dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
757     dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
758     uint64_t object, dmu_tx_t *tx)
759 {
760         int j;
761
762         for (j = 0; j < dnp->dn_nblkptr; j++) {
763                 zbookmark_t czb;
764
765                 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
766                     dnp->dn_nlevels - 1, j);
767                 dsl_scan_visitbp(&dnp->dn_blkptr[j],
768                     &czb, dnp, buf, ds, scn, ostype, tx);
769         }
770
771         if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
772                 zbookmark_t czb;
773                 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
774                     0, DMU_SPILL_BLKID);
775                 dsl_scan_visitbp(&dnp->dn_spill,
776                     &czb, dnp, buf, ds, scn, ostype, tx);
777         }
778 }
779
780 /*
781  * The arguments are in this order because mdb can only print the
782  * first 5; we want them to be useful.
783  */
784 static void
785 dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
786     dnode_phys_t *dnp, arc_buf_t *pbuf,
787     dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
788     dmu_tx_t *tx)
789 {
790         dsl_pool_t *dp = scn->scn_dp;
791         arc_buf_t *buf = NULL;
792         blkptr_t *bp_toread;
793
794         bp_toread = kmem_alloc(sizeof (blkptr_t), KM_PUSHPAGE);
795         *bp_toread = *bp;
796
797         /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
798
799         if (dsl_scan_check_pause(scn, zb))
800                 goto out;
801
802         if (dsl_scan_check_resume(scn, dnp, zb))
803                 goto out;
804
805         if (bp->blk_birth == 0)
806                 goto out;
807
808         scn->scn_visited_this_txg++;
809
810         /*
811          * This debugging is commented out to conserve stack space.  This
812          * function is called recursively and the debugging adds several
813          * bytes to the stack for each call.  It can be commented back in
814          * if required to debug an issue in dsl_scan_visitbp().
815          *
816          * dprintf_bp(bp,
817          *    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
818          *    ds, ds ? ds->ds_object : 0,
819          *    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
820          *    pbuf, bp);
821          */
822
823         if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
824                 goto out;
825
826         if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
827                 /*
828                  * For non-user-accounting blocks, we need to read the
829                  * new bp (from a deleted snapshot, found in
830                  * check_existing_xlation).  If we used the old bp,
831                  * pointers inside this block from before we resumed
832                  * would be untranslated.
833                  *
834                  * For user-accounting blocks, we need to read the old
835                  * bp, because we will apply the entire space delta to
836                  * it (original untranslated -> translations from
837                  * deleted snap -> now).
838                  */
839                 *bp_toread = *bp;
840         }
841
842         if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx,
843             &buf) != 0)
844                 goto out;
845
846         /*
847          * If dsl_scan_ddt() has already visited this block, it will have
848          * already done any translations or scrubbing, so don't call the
849          * callback again.
850          */
851         if (ddt_class_contains(dp->dp_spa,
852             scn->scn_phys.scn_ddt_class_max, bp)) {
853                 ASSERT(buf == NULL);
854                 goto out;
855         }
856
857         /*
858          * If this block is from the future (after cur_max_txg), then we
859          * are doing this on behalf of a deleted snapshot, and we will
860          * revisit the future block on the next pass of this dataset.
861          * Don't scan it now unless we need to because something
862          * under it was modified.
863          */
864         if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
865                 scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
866         }
867         if (buf)
868                 (void) arc_buf_remove_ref(buf, &buf);
869 out:
870         kmem_free(bp_toread, sizeof(blkptr_t));
871 }
872
873 static void
874 dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
875     dmu_tx_t *tx)
876 {
877         zbookmark_t zb;
878
879         SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
880             ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
881         dsl_scan_visitbp(bp, &zb, NULL, NULL,
882             ds, scn, DMU_OST_NONE, tx);
883
884         dprintf_ds(ds, "finished scan%s", "");
885 }
886
887 void
888 dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
889 {
890         dsl_pool_t *dp = ds->ds_dir->dd_pool;
891         dsl_scan_t *scn = dp->dp_scan;
892         uint64_t mintxg;
893
894         if (scn->scn_phys.scn_state != DSS_SCANNING)
895                 return;
896
897         if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
898                 if (dsl_dataset_is_snapshot(ds)) {
899                         /* Note, scn_cur_{min,max}_txg stays the same. */
900                         scn->scn_phys.scn_bookmark.zb_objset =
901                             ds->ds_phys->ds_next_snap_obj;
902                         zfs_dbgmsg("destroying ds %llu; currently traversing; "
903                             "reset zb_objset to %llu",
904                             (u_longlong_t)ds->ds_object,
905                             (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
906                         scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
907                 } else {
908                         SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
909                             ZB_DESTROYED_OBJSET, 0, 0, 0);
910                         zfs_dbgmsg("destroying ds %llu; currently traversing; "
911                             "reset bookmark to -1,0,0,0",
912                             (u_longlong_t)ds->ds_object);
913                 }
914         } else if (zap_lookup_int_key(dp->dp_meta_objset,
915             scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
916                 ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
917                 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
918                     scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
919                 if (dsl_dataset_is_snapshot(ds)) {
920                         /*
921                          * We keep the same mintxg; it could be >
922                          * ds_creation_txg if the previous snapshot was
923                          * deleted too.
924                          */
925                         VERIFY(zap_add_int_key(dp->dp_meta_objset,
926                             scn->scn_phys.scn_queue_obj,
927                             ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
928                         zfs_dbgmsg("destroying ds %llu; in queue; "
929                             "replacing with %llu",
930                             (u_longlong_t)ds->ds_object,
931                             (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
932                 } else {
933                         zfs_dbgmsg("destroying ds %llu; in queue; removing",
934                             (u_longlong_t)ds->ds_object);
935                 }
936         } else {
937                 zfs_dbgmsg("destroying ds %llu; ignoring",
938                     (u_longlong_t)ds->ds_object);
939         }
940
941         /*
942          * dsl_scan_sync() should be called after this, and should sync
943          * out our changed state, but just to be safe, do it here.
944          */
945         dsl_scan_sync_state(scn, tx);
946 }
947
948 void
949 dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
950 {
951         dsl_pool_t *dp = ds->ds_dir->dd_pool;
952         dsl_scan_t *scn = dp->dp_scan;
953         uint64_t mintxg;
954
955         if (scn->scn_phys.scn_state != DSS_SCANNING)
956                 return;
957
958         ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
959
960         if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
961                 scn->scn_phys.scn_bookmark.zb_objset =
962                     ds->ds_phys->ds_prev_snap_obj;
963                 zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
964                     "reset zb_objset to %llu",
965                     (u_longlong_t)ds->ds_object,
966                     (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
967         } else if (zap_lookup_int_key(dp->dp_meta_objset,
968             scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
969                 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
970                     scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
971                 VERIFY(zap_add_int_key(dp->dp_meta_objset,
972                     scn->scn_phys.scn_queue_obj,
973                     ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
974                 zfs_dbgmsg("snapshotting ds %llu; in queue; "
975                     "replacing with %llu",
976                     (u_longlong_t)ds->ds_object,
977                     (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
978         }
979         dsl_scan_sync_state(scn, tx);
980 }
981
982 void
983 dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
984 {
985         dsl_pool_t *dp = ds1->ds_dir->dd_pool;
986         dsl_scan_t *scn = dp->dp_scan;
987         uint64_t mintxg;
988
989         if (scn->scn_phys.scn_state != DSS_SCANNING)
990                 return;
991
992         if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
993                 scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
994                 zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
995                     "reset zb_objset to %llu",
996                     (u_longlong_t)ds1->ds_object,
997                     (u_longlong_t)ds2->ds_object);
998         } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
999                 scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
1000                 zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
1001                     "reset zb_objset to %llu",
1002                     (u_longlong_t)ds2->ds_object,
1003                     (u_longlong_t)ds1->ds_object);
1004         }
1005
1006         if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
1007             ds1->ds_object, &mintxg) == 0) {
1008                 int err;
1009
1010                 ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
1011                 ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
1012                 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
1013                     scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
1014                 err = zap_add_int_key(dp->dp_meta_objset,
1015                     scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
1016                 VERIFY(err == 0 || err == EEXIST);
1017                 if (err == EEXIST) {
1018                         /* Both were there to begin with */
1019                         VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
1020                             scn->scn_phys.scn_queue_obj,
1021                             ds1->ds_object, mintxg, tx));
1022                 }
1023                 zfs_dbgmsg("clone_swap ds %llu; in queue; "
1024                     "replacing with %llu",
1025                     (u_longlong_t)ds1->ds_object,
1026                     (u_longlong_t)ds2->ds_object);
1027         } else if (zap_lookup_int_key(dp->dp_meta_objset,
1028             scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
1029                 ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
1030                 ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
1031                 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
1032                     scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
1033                 VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
1034                     scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
1035                 zfs_dbgmsg("clone_swap ds %llu; in queue; "
1036                     "replacing with %llu",
1037                     (u_longlong_t)ds2->ds_object,
1038                     (u_longlong_t)ds1->ds_object);
1039         }
1040
1041         dsl_scan_sync_state(scn, tx);
1042 }
1043
1044 struct enqueue_clones_arg {
1045         dmu_tx_t *tx;
1046         uint64_t originobj;
1047 };
1048
1049 /* ARGSUSED */
1050 static int
1051 enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
1052 {
1053         struct enqueue_clones_arg *eca = arg;
1054         dsl_dataset_t *ds;
1055         int err;
1056         dsl_pool_t *dp = spa->spa_dsl_pool;
1057         dsl_scan_t *scn = dp->dp_scan;
1058
1059         err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
1060         if (err)
1061                 return (err);
1062
1063         if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
1064                 while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
1065                         dsl_dataset_t *prev;
1066                         err = dsl_dataset_hold_obj(dp,
1067                             ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
1068
1069                         dsl_dataset_rele(ds, FTAG);
1070                         if (err)
1071                                 return (err);
1072                         ds = prev;
1073                 }
1074                 VERIFY(zap_add_int_key(dp->dp_meta_objset,
1075                     scn->scn_phys.scn_queue_obj, ds->ds_object,
1076                     ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
1077         }
1078         dsl_dataset_rele(ds, FTAG);
1079         return (0);
1080 }
1081
1082 static void
1083 dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
1084 {
1085         dsl_pool_t *dp = scn->scn_dp;
1086         dsl_dataset_t *ds;
1087         objset_t *os;
1088         char *dsname;
1089
1090         VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
1091
1092         if (dmu_objset_from_ds(ds, &os))
1093                 goto out;
1094
1095         /*
1096          * Only the ZIL in the head (non-snapshot) is valid.  Even though
1097          * snapshots can have ZIL block pointers (which may be the same
1098          * BP as in the head), they must be ignored.  So we traverse the
1099          * ZIL here, rather than in scan_recurse(), because the regular
1100          * snapshot block-sharing rules don't apply to it.
1101          */
1102         if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
1103                 dsl_scan_zil(dp, &os->os_zil_header);
1104
1105         /*
1106          * Iterate over the bps in this ds.
1107          */
1108         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1109         dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
1110
1111         dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_PUSHPAGE);
1112         dsl_dataset_name(ds, dsname);
1113         zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
1114             "pausing=%u",
1115             (longlong_t)dsobj, dsname,
1116             (longlong_t)scn->scn_phys.scn_cur_min_txg,
1117             (longlong_t)scn->scn_phys.scn_cur_max_txg,
1118             (int)scn->scn_pausing);
1119         kmem_free(dsname, ZFS_MAXNAMELEN);
1120
1121         if (scn->scn_pausing)
1122                 goto out;
1123
1124         /*
1125          * We've finished this pass over this dataset.
1126          */
1127
1128         /*
1129          * If we did not completely visit this dataset, do another pass.
1130          */
1131         if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
1132                 zfs_dbgmsg("incomplete pass; visiting again");
1133                 scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
1134                 VERIFY(zap_add_int_key(dp->dp_meta_objset,
1135                     scn->scn_phys.scn_queue_obj, ds->ds_object,
1136                     scn->scn_phys.scn_cur_max_txg, tx) == 0);
1137                 goto out;
1138         }
1139
1140         /*
1141          * Add descendent datasets to work queue.
1142          */
1143         if (ds->ds_phys->ds_next_snap_obj != 0) {
1144                 VERIFY(zap_add_int_key(dp->dp_meta_objset,
1145                     scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
1146                     ds->ds_phys->ds_creation_txg, tx) == 0);
1147         }
1148         if (ds->ds_phys->ds_num_children > 1) {
1149                 boolean_t usenext = B_FALSE;
1150                 if (ds->ds_phys->ds_next_clones_obj != 0) {
1151                         uint64_t count;
1152                         /*
1153                          * A bug in a previous version of the code could
1154                          * cause upgrade_clones_cb() to not set
1155                          * ds_next_snap_obj when it should, leading to a
1156                          * missing entry.  Therefore we can only use the
1157                          * next_clones_obj when its count is correct.
1158                          */
1159                         int err = zap_count(dp->dp_meta_objset,
1160                             ds->ds_phys->ds_next_clones_obj, &count);
1161                         if (err == 0 &&
1162                             count == ds->ds_phys->ds_num_children - 1)
1163                                 usenext = B_TRUE;
1164                 }
1165
1166                 if (usenext) {
1167                         VERIFY(zap_join_key(dp->dp_meta_objset,
1168                             ds->ds_phys->ds_next_clones_obj,
1169                             scn->scn_phys.scn_queue_obj,
1170                             ds->ds_phys->ds_creation_txg, tx) == 0);
1171                 } else {
1172                         struct enqueue_clones_arg eca;
1173                         eca.tx = tx;
1174                         eca.originobj = ds->ds_object;
1175
1176                         (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
1177                             NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
1178                 }
1179         }
1180
1181 out:
1182         dsl_dataset_rele(ds, FTAG);
1183 }
1184
1185 /* ARGSUSED */
1186 static int
1187 enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
1188 {
1189         dmu_tx_t *tx = arg;
1190         dsl_dataset_t *ds;
1191         int err;
1192         dsl_pool_t *dp = spa->spa_dsl_pool;
1193         dsl_scan_t *scn = dp->dp_scan;
1194
1195         err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
1196         if (err)
1197                 return (err);
1198
1199         while (ds->ds_phys->ds_prev_snap_obj != 0) {
1200                 dsl_dataset_t *prev;
1201                 err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
1202                     FTAG, &prev);
1203                 if (err) {
1204                         dsl_dataset_rele(ds, FTAG);
1205                         return (err);
1206                 }
1207
1208                 /*
1209                  * If this is a clone, we don't need to worry about it for now.
1210                  */
1211                 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
1212                         dsl_dataset_rele(ds, FTAG);
1213                         dsl_dataset_rele(prev, FTAG);
1214                         return (0);
1215                 }
1216                 dsl_dataset_rele(ds, FTAG);
1217                 ds = prev;
1218         }
1219
1220         VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
1221             ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
1222         dsl_dataset_rele(ds, FTAG);
1223         return (0);
1224 }
1225
1226 /*
1227  * Scrub/dedup interaction.
1228  *
1229  * If there are N references to a deduped block, we don't want to scrub it
1230  * N times -- ideally, we should scrub it exactly once.
1231  *
1232  * We leverage the fact that the dde's replication class (enum ddt_class)
1233  * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
1234  * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
1235  *
1236  * To prevent excess scrubbing, the scrub begins by walking the DDT
1237  * to find all blocks with refcnt > 1, and scrubs each of these once.
1238  * Since there are two replication classes which contain blocks with
1239  * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
1240  * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
1241  *
1242  * There would be nothing more to say if a block's refcnt couldn't change
1243  * during a scrub, but of course it can so we must account for changes
1244  * in a block's replication class.
1245  *
1246  * Here's an example of what can occur:
1247  *
1248  * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
1249  * when visited during the top-down scrub phase, it will be scrubbed twice.
1250  * This negates our scrub optimization, but is otherwise harmless.
1251  *
1252  * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
1253  * on each visit during the top-down scrub phase, it will never be scrubbed.
1254  * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
1255  * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
1256  * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
1257  * while a scrub is in progress, it scrubs the block right then.
1258  */
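/*
 * For reference, enum ddt_class orders the classes as DDT_CLASS_DITTO (0),
 * DDT_CLASS_DUPLICATE (1), DDT_CLASS_UNIQUE (2), so ddt_walk() visits the
 * refcnt > 1 classes first, and the default zfs_scrub_ddt_class_max of
 * DDT_CLASS_DUPLICATE confines this phase to blocks with multiple refs.
 */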
1259 static void
1260 dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
1261 {
1262         ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
1263         ddt_entry_t dde;
1264         int error;
1265         uint64_t n = 0;
1266
1267         bzero(&dde, sizeof (ddt_entry_t));
1268
1269         while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
1270                 ddt_t *ddt;
1271
1272                 if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
1273                         break;
1274                 dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
1275                     (longlong_t)ddb->ddb_class,
1276                     (longlong_t)ddb->ddb_type,
1277                     (longlong_t)ddb->ddb_checksum,
1278                     (longlong_t)ddb->ddb_cursor);
1279
1280                 /* There should be no pending changes to the dedup table */
1281                 ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
1282                 ASSERT(avl_first(&ddt->ddt_tree) == NULL);
1283
1284                 dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
1285                 n++;
1286
1287                 if (dsl_scan_check_pause(scn, NULL))
1288                         break;
1289         }
1290
1291         zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
1292             (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
1293             (int)scn->scn_pausing);
1294
1295         ASSERT(error == 0 || error == ENOENT);
1296         ASSERT(error != ENOENT ||
1297             ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
1298 }
1299
1300 /* ARGSUSED */
1301 void
1302 dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
1303     ddt_entry_t *dde, dmu_tx_t *tx)
1304 {
1305         const ddt_key_t *ddk = &dde->dde_key;
1306         ddt_phys_t *ddp = dde->dde_phys;
1307         blkptr_t bp;
1308         zbookmark_t zb = { 0 };
1309         int p;
1310
1311         if (scn->scn_phys.scn_state != DSS_SCANNING)
1312                 return;
1313
1314         for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1315                 if (ddp->ddp_phys_birth == 0 ||
1316                     ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
1317                         continue;
1318                 ddt_bp_create(checksum, ddk, ddp, &bp);
1319
1320                 scn->scn_visited_this_txg++;
1321                 scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
1322         }
1323 }
1324
1325 static void
1326 dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
1327 {
1328         dsl_pool_t *dp = scn->scn_dp;
1329         zap_cursor_t *zc;
1330         zap_attribute_t *za;
1331
1332         if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1333             scn->scn_phys.scn_ddt_class_max) {
1334                 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
1335                 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
1336                 dsl_scan_ddt(scn, tx);
1337                 if (scn->scn_pausing)
1338                         return;
1339         }
1340
1341         if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
1342                 /* First do the MOS & ORIGIN */
1343
1344                 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
1345                 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
1346                 dsl_scan_visit_rootbp(scn, NULL,
1347                     &dp->dp_meta_rootbp, tx);
1348                 spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
1349                 if (scn->scn_pausing)
1350                         return;
1351
1352                 if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
1353                         VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
1354                             NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
1355                 } else {
1356                         dsl_scan_visitds(scn,
1357                             dp->dp_origin_snap->ds_object, tx);
1358                 }
1359                 ASSERT(!scn->scn_pausing);
1360         } else if (scn->scn_phys.scn_bookmark.zb_objset !=
1361             ZB_DESTROYED_OBJSET) {
1362                 /*
1363                  * If we were paused, continue from here.  Note if the
1364                  * ds we were paused on was deleted, the zb_objset may
1365                  * be -1, so we will skip this and find a new objset
1366                  * below.
1367                  */
1368                 dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
1369                 if (scn->scn_pausing)
1370                         return;
1371         }
1372
1373         /*
1374          * In case we were paused right at the end of the ds, zero the
1375          * bookmark so we don't think that we're still trying to resume.
1376          */
1377         bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
1378         zc = kmem_alloc(sizeof (zap_cursor_t), KM_PUSHPAGE);
1379         za = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE);
1380
1381         /* keep pulling things out of the zap-object-as-queue */
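        /*
         * The cursor is torn down and re-initialized on every pass
         * because the queue is modified while we walk it: the entry
         * just visited is removed below, and visiting a dataset may
         * enqueue further datasets.
         */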
1382         while (zap_cursor_init(zc, dp->dp_meta_objset,
1383             scn->scn_phys.scn_queue_obj),
1384             zap_cursor_retrieve(zc, za) == 0) {
1385                 dsl_dataset_t *ds;
1386                 uint64_t dsobj;
1387
1388                 dsobj = strtonum(za->za_name, NULL);
1389                 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
1390                     scn->scn_phys.scn_queue_obj, dsobj, tx));
1391
1392                 /* Set up min/max txg */
1393                 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
1394                 if (za->za_first_integer != 0) {
1395                         scn->scn_phys.scn_cur_min_txg =
1396                             MAX(scn->scn_phys.scn_min_txg,
1397                             za->za_first_integer);
1398                 } else {
1399                         scn->scn_phys.scn_cur_min_txg =
1400                             MAX(scn->scn_phys.scn_min_txg,
1401                             ds->ds_phys->ds_prev_snap_txg);
1402                 }
1403                 scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
1404                 dsl_dataset_rele(ds, FTAG);
1405
1406                 dsl_scan_visitds(scn, dsobj, tx);
1407                 zap_cursor_fini(zc);
1408                 if (scn->scn_pausing)
1409                         goto out;
1410         }
1411         zap_cursor_fini(zc);
1412 out:
1413         kmem_free(za, sizeof (zap_attribute_t));
1414         kmem_free(zc, sizeof (zap_cursor_t));
1415 }
1416
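/*
 * bpobj_iterate() callback used to drain the free_bpobj: issue an
 * asynchronous free for one block and credit the reclaimed space back
 * to dp_free_dir.  Returns ERESTART, leaving the remainder for a later
 * txg, once the allotted time for this txg has been used up or the
 * pool is shutting down.
 */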
1417 static int
1418 dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1419 {
1420         dsl_scan_t *scn = arg;
1421         uint64_t elapsed_nanosecs;
1422
1423         elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
1424
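        /*
         * Stop freeing (and pick up where we left off next txg) once
         * we have been at it for more than zfs_txg_timeout seconds,
         * once we have used at least zfs_free_min_time_ms and the txg
         * sync thread is waiting on us, or if the pool is being
         * exported or destroyed.  Note that elapsed_nanosecs / MICROSEC
         * yields milliseconds.
         */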
1425         if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
1426             (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
1427             txg_sync_waiting(scn->scn_dp)) ||
1428             spa_shutting_down(scn->scn_dp->dp_spa))
1429                 return (ERESTART);
1430
1431         zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
1432             dmu_tx_get_txg(tx), bp, 0));
1433         dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
1434             -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
1435             -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
1436         scn->scn_visited_this_txg++;
1437         return (0);
1438 }
1439
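/*
 * Report whether there is still scan-related work to do in syncing
 * context: either a scan is in progress, or (on pools with deadlist
 * support) the free_bpobj still holds blocks waiting to be freed.
 * Always B_FALSE while the pool is loading or shutting down.
 */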
1440 boolean_t
1441 dsl_scan_active(dsl_scan_t *scn)
1442 {
1443         spa_t *spa = scn->scn_dp->dp_spa;
1444         uint64_t used = 0, comp, uncomp;
1445
1446         if (spa->spa_load_state != SPA_LOAD_NONE)
1447                 return (B_FALSE);
1448         if (spa_shutting_down(spa))
1449                 return (B_FALSE);
1450
1451         if (scn->scn_phys.scn_state == DSS_SCANNING)
1452                 return (B_TRUE);
1453
1454         if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1455                 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
1456                     &used, &comp, &uncomp);
1457         }
1458         return (used != 0);
1459 }
1460
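/*
 * Per-txg entry point, called from the pool sync path.  Restarts a
 * requested scan, drains the free_bpobj first (so the scan itself
 * never has to traverse it), then runs the scan until it finishes or
 * exhausts its time budget for this txg, and finally writes the
 * updated scan state back to the MOS.
 */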
1461 void
1462 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
1463 {
1464         dsl_scan_t *scn = dp->dp_scan;
1465         spa_t *spa = dp->dp_spa;
1466         int err;
1467
1468         /*
1469          * Check for scn_restart_txg before checking spa_load_state, so
1470          * that we can restart an old-style scan while the pool is being
1471          * imported (see dsl_scan_init).
1472          */
1473         if (scn->scn_restart_txg != 0 &&
1474             scn->scn_restart_txg <= tx->tx_txg) {
1475                 pool_scan_func_t func = POOL_SCAN_SCRUB;
1476                 dsl_scan_done(scn, B_FALSE, tx);
1477                 if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
1478                         func = POOL_SCAN_RESILVER;
1479                 zfs_dbgmsg("restarting scan func=%u txg=%llu",
1480                     func, (longlong_t)tx->tx_txg);
1481                 dsl_scan_setup_sync(scn, &func, tx);
1482         }
1483
1484         if (!dsl_scan_active(scn) ||
1485             spa_sync_pass(dp->dp_spa) > 1)
1486                 return;
1487
1488         scn->scn_visited_this_txg = 0;
1489         scn->scn_pausing = B_FALSE;
1490         scn->scn_sync_start_time = gethrtime();
1491         spa->spa_scrub_active = B_TRUE;
1492
1493         /*
1494          * First process the free list.  If we pause the free, don't do
1495          * any scanning.  This ensures that there is no free list when
1496          * we are scanning, so the scan code doesn't have to worry about
1497          * traversing it.
1498          */
1499         if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1500                 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1501                     NULL, ZIO_FLAG_MUSTSUCCEED);
1502                 err = bpobj_iterate(&dp->dp_free_bpobj,
1503                     dsl_scan_free_cb, scn, tx);
1504                 VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
1505                 if (scn->scn_visited_this_txg) {
1506                         zfs_dbgmsg("freed %llu blocks in %llums from "
1507                             "free_bpobj txg %llu",
1508                             (longlong_t)scn->scn_visited_this_txg,
1509                             (longlong_t)
1510                             (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
1511                             (longlong_t)tx->tx_txg);
1512                         scn->scn_visited_this_txg = 0;
1513                         /*
1514                          * Re-sync the ddt so that we can further modify
1515                          * it when doing bprewrite.
1516                          */
1517                         ddt_sync(spa, tx->tx_txg);
1518                 }
1519                 if (err == ERESTART)
1520                         return;
1521         }
1522
1523         if (scn->scn_phys.scn_state != DSS_SCANNING)
1524                 return;
1525
1526         if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1527             scn->scn_phys.scn_ddt_class_max) {
1528                 zfs_dbgmsg("doing scan sync txg %llu; "
1529                     "ddt bm=%llu/%llu/%llu/%llx",
1530                     (longlong_t)tx->tx_txg,
1531                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
1532                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
1533                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
1534                     (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
1535                 ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
1536                 ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
1537                 ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
1538                 ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
1539         } else {
1540                 zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
1541                     (longlong_t)tx->tx_txg,
1542                     (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
1543                     (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
1544                     (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
1545                     (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
1546         }
1547
1548         scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1549             NULL, ZIO_FLAG_CANFAIL);
1550         dsl_scan_visit(scn, tx);
1551         (void) zio_wait(scn->scn_zio_root);
1552         scn->scn_zio_root = NULL;
1553
1554         zfs_dbgmsg("visited %llu blocks in %llums",
1555             (longlong_t)scn->scn_visited_this_txg,
1556             (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
1557
1558         if (!scn->scn_pausing) {
1559                 /* finished with scan. */
1560                 zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
1561                 dsl_scan_done(scn, B_TRUE, tx);
1562         }
1563
1564         if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
1565                 mutex_enter(&spa->spa_scrub_lock);
1566                 while (spa->spa_scrub_inflight > 0) {
1567                         cv_wait(&spa->spa_scrub_io_cv,
1568                             &spa->spa_scrub_lock);
1569                 }
1570                 mutex_exit(&spa->spa_scrub_lock);
1571         }
1572
1573         dsl_scan_sync_state(scn, tx);
1574 }
1575
1576 /*
1577  * This will start a new scan, or restart an existing one.
1578  */
1579 void
1580 dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
1581 {
1582         if (txg == 0) {
1583                 dmu_tx_t *tx;
1584                 tx = dmu_tx_create_dd(dp->dp_mos_dir);
1585                 VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
1586
1587                 txg = dmu_tx_get_txg(tx);
1588                 dp->dp_scan->scn_restart_txg = txg;
1589                 dmu_tx_commit(tx);
1590         } else {
1591                 dp->dp_scan->scn_restart_txg = txg;
1592         }
1593         zfs_dbgmsg("restarting resilver txg=%llu", (longlong_t)txg);
1594 }
1595
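/*
 * Report whether a resilver is currently in progress on this pool.
 */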
1596 boolean_t
1597 dsl_scan_resilvering(dsl_pool_t *dp)
1598 {
1599         return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
1600             dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
1601 }
1602
1603 /*
1604  * scrub consumers
1605  */
1606
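/*
 * Accumulate per-level / per-object-type statistics (block counts,
 * allocated/logical/physical sizes, gang and ditto counts) for every
 * block examined by a scrub or resilver.
 */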
1607 static void
1608 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
1609 {
1610         int i;
1611
1612         /*
1613          * If we resume after a reboot, zab will be NULL; don't record
1614          * incomplete stats in that case.
1615          */
1616         if (zab == NULL)
1617                 return;
1618
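        /*
         * Each block is tallied into four buckets: (its level, its
         * type), (its level, DMU_OT_TOTAL), (DN_MAX_LEVELS, its type)
         * and (DN_MAX_LEVELS, DMU_OT_TOTAL), so per-level, per-type
         * and grand totals all fall out of the same table.
         */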
1619         for (i = 0; i < 4; i++) {
1620                 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
1621                 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
1622                 zfs_blkstat_t *zb = &zab->zab_type[l][t];
1623                 int equal;
1624
1625                 zb->zb_count++;
1626                 zb->zb_asize += BP_GET_ASIZE(bp);
1627                 zb->zb_lsize += BP_GET_LSIZE(bp);
1628                 zb->zb_psize += BP_GET_PSIZE(bp);
1629                 zb->zb_gangs += BP_COUNT_GANG(bp);
1630
1631                 switch (BP_GET_NDVAS(bp)) {
1632                 case 2:
1633                         if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1634                             DVA_GET_VDEV(&bp->blk_dva[1]))
1635                                 zb->zb_ditto_2_of_2_samevdev++;
1636                         break;
1637                 case 3:
1638                         equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1639                             DVA_GET_VDEV(&bp->blk_dva[1])) +
1640                             (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1641                             DVA_GET_VDEV(&bp->blk_dva[2])) +
1642                             (DVA_GET_VDEV(&bp->blk_dva[1]) ==
1643                             DVA_GET_VDEV(&bp->blk_dva[2]));
1644                         if (equal == 1)
1645                                 zb->zb_ditto_2_of_3_samevdev++;
1646                         else if (equal == 3)
1647                                 zb->zb_ditto_3_of_3_samevdev++;
1648                         break;
1649                 }
1650         }
1651 }
1652
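/*
 * Completion callback for the scrub/resilver reads issued from
 * dsl_scan_scrub_cb(): free the data buffer, drop spa_scrub_inflight
 * and wake anyone throttled on it, and count any error other than a
 * checksum failure on a speculative (intent log) read.
 */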
1653 static void
1654 dsl_scan_scrub_done(zio_t *zio)
1655 {
1656         spa_t *spa = zio->io_spa;
1657
1658         zio_data_buf_free(zio->io_data, zio->io_size);
1659
1660         mutex_enter(&spa->spa_scrub_lock);
1661         spa->spa_scrub_inflight--;
1662         cv_broadcast(&spa->spa_scrub_io_cv);
1663
1664         if (zio->io_error && (zio->io_error != ECKSUM ||
1665             !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
1666                 spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
1667         }
1668         mutex_exit(&spa->spa_scrub_lock);
1669 }
1670
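/*
 * Per-block callback for scrubs and resilvers.  Blocks outside the
 * scan's txg range are skipped; otherwise the block is accounted and,
 * if it needs verification or repair (a scrub reads everything, a
 * resilver reads only blocks whose DVAs land in a missing DTL range or
 * are gang blocks), a throttled read is issued whose result is recorded
 * by dsl_scan_scrub_done().
 */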
1671 static int
1672 dsl_scan_scrub_cb(dsl_pool_t *dp,
1673     const blkptr_t *bp, const zbookmark_t *zb)
1674 {
1675         dsl_scan_t *scn = dp->dp_scan;
1676         size_t size = BP_GET_PSIZE(bp);
1677         spa_t *spa = dp->dp_spa;
1678         uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
1679         boolean_t needs_io = B_FALSE;
1680         int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
1681         int zio_priority = 0;
1682         int scan_delay = 0;
1683         int d;
1684
1685         if (phys_birth <= scn->scn_phys.scn_min_txg ||
1686             phys_birth >= scn->scn_phys.scn_max_txg)
1687                 return (0);
1688
1689         count_block(dp->dp_blkstats, bp);
1690
1691         ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
1692         if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
1693                 zio_flags |= ZIO_FLAG_SCRUB;
1694                 zio_priority = ZIO_PRIORITY_SCRUB;
1695                 needs_io = B_TRUE;
1696                 scan_delay = zfs_scrub_delay;
1697         } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
1698                 zio_flags |= ZIO_FLAG_RESILVER;
1699                 zio_priority = ZIO_PRIORITY_RESILVER;
1700                 needs_io = B_FALSE;
1701                 scan_delay = zfs_resilver_delay;
1702         }
1703
1704         /* If it's an intent log block, failure is expected. */
1705         if (zb->zb_level == ZB_ZIL_LEVEL)
1706                 zio_flags |= ZIO_FLAG_SPECULATIVE;
1707
1708         for (d = 0; d < BP_GET_NDVAS(bp); d++) {
1709                 vdev_t *vd = vdev_lookup_top(spa,
1710                     DVA_GET_VDEV(&bp->blk_dva[d]));
1711
1712                 /*
1713                  * Keep track of how much data we've examined so that
1714                  * zpool(1M) status can make useful progress reports.
1715                  */
1716                 scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
1717                 spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
1718
1719                 /* if it's a resilver, this may not be in the target range */
1720                 if (!needs_io) {
1721                         if (DVA_GET_GANG(&bp->blk_dva[d])) {
1722                                 /*
1723                                  * Gang members may be spread across multiple
1724                                  * vdevs, so the best estimate we have is the
1725                                  * scrub range, which has already been checked.
1726                                  * XXX -- it would be better to change our
1727                                  * allocation policy to ensure that all
1728                                  * gang members reside on the same vdev.
1729                                  */
1730                                 needs_io = B_TRUE;
1731                         } else {
1732                                 needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
1733                                     phys_birth, 1);
1734                         }
1735                 }
1736         }
1737
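        /*
         * Issue the read.  The total number of in-flight scan I/Os is
         * capped at zfs_top_maxinflight per top-level vdev, and the
         * read is delayed by scan_delay ticks whenever other pool I/O
         * has been seen within the last zfs_scan_idle ticks.
         */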
1738         if (needs_io && !zfs_no_scrub_io) {
1739                 vdev_t *rvd = spa->spa_root_vdev;
1740                 uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
1741                 void *data = zio_data_buf_alloc(size);
1742
1743                 mutex_enter(&spa->spa_scrub_lock);
1744                 while (spa->spa_scrub_inflight >= maxinflight)
1745                         cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
1746                 spa->spa_scrub_inflight++;
1747                 mutex_exit(&spa->spa_scrub_lock);
1748
1749                 /*
1750                  * If we're seeing recent (zfs_scan_idle) "important" I/Os
1751                  * then throttle our workload to limit the impact of a scan.
1752                  */
1753                 if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
1754                         delay(scan_delay);
1755
1756                 zio_nowait(zio_read(NULL, spa, bp, data, size,
1757                     dsl_scan_scrub_done, NULL, zio_priority,
1758                     zio_flags, zb));
1759         }
1760
1761         /* do not relocate this block */
1762         return (0);
1763 }
1764
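/*
 * Open-context entry point for starting a scrub or resilver.  As a
 * rough sketch of how a scrub request reaches this code, spa_scan()
 * invokes it along the lines of:
 *
 *      (void) dsl_scan(spa_get_dsl(spa), POOL_SCAN_SCRUB);
 */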
1765 int
1766 dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
1767 {
1768         spa_t *spa = dp->dp_spa;
1769
1770         /*
1771          * Purge all vdev caches and probe all devices.  We do this here
1772          * rather than in sync context because this requires a writer lock
1773          * on the spa_config lock, which we can't do from sync context.  The
1774          * spa_scrub_reopen flag indicates that vdev_open() should not
1775          * attempt to start another scrub.
1776          */
1777         spa_vdev_state_enter(spa, SCL_NONE);
1778         spa->spa_scrub_reopen = B_TRUE;
1779         vdev_reopen(spa->spa_root_vdev);
1780         spa->spa_scrub_reopen = B_FALSE;
1781         (void) spa_vdev_state_exit(spa, NULL, 0);
1782
1783         return (dsl_sync_task_do(dp, dsl_scan_setup_check,
1784             dsl_scan_setup_sync, dp->dp_scan, &func, 0));
1785 }
1786
1787 #if defined(_KERNEL) && defined(HAVE_SPL)
1788 module_param(zfs_top_maxinflight, int, 0644);
1789 MODULE_PARM_DESC(zfs_top_maxinflight, "Max I/Os per top-level vdev");
1790
1791 module_param(zfs_resilver_delay, int, 0644);
1792 MODULE_PARM_DESC(zfs_resilver_delay, "Number of ticks to delay resilver");
1793
1794 module_param(zfs_scrub_delay, int, 0644);
1795 MODULE_PARM_DESC(zfs_scrub_delay, "Number of ticks to delay scrub");
1796
1797 module_param(zfs_scan_idle, int, 0644);
1798 MODULE_PARM_DESC(zfs_scan_idle, "Idle window in clock ticks");
1799
1800 module_param(zfs_scan_min_time_ms, int, 0644);
1801 MODULE_PARM_DESC(zfs_scan_min_time_ms, "Min millisecs to scrub per txg");
1802
1803 module_param(zfs_free_min_time_ms, int, 0644);
1804 MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg");
1805
1806 module_param(zfs_resilver_min_time_ms, int, 0644);
1807 MODULE_PARM_DESC(zfs_resilver_min_time_ms, "Min millisecs to resilver per txg");
1808
1809 module_param(zfs_no_scrub_io, int, 0644);
1810 MODULE_PARM_DESC(zfs_no_scrub_io, "Set to disable scrub I/O");
1811
1812 module_param(zfs_no_scrub_prefetch, int, 0644);
1813 MODULE_PARM_DESC(zfs_no_scrub_prefetch, "Set to disable scrub prefetching");
1814 #endif