Add AUTHORS to master branch

diff --git a/module/zfs/dsl_scrub.c b/module/zfs/dsl_scrub.c
index 950a91f..03ebb90 100644
--- a/module/zfs/dsl_scrub.c
+++ b/module/zfs/dsl_scrub.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -45,6 +45,8 @@ typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
 
 static scrub_cb_t dsl_pool_scrub_clean_cb;
 static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
+static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
+    uint64_t objset, uint64_t object);
 
 int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
 int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */
@@ -95,6 +97,9 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
                            ESC_ZFS_RESILVER_START);
                        dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
                            tx->tx_txg);
+               } else {
+                       spa_event_notify(dp->dp_spa, NULL,
+                           ESC_ZFS_SCRUB_START);
                }
 
                /* zero out the scrub stats in all vdev_stat_t's */
@@ -212,8 +217,9 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
         */
        vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
            *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
-       if (dp->dp_scrub_min_txg && *completep)
-               spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH);
+       if (*completep)
+               spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ?
+                   ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
        spa_errlog_rotate(dp->dp_spa);
 
        /*
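
The two hunks above make event notification symmetric between scrubs and
resilvers: dp_scrub_min_txg is nonzero only for a resilver (only blocks
born after that txg are scanned), so the same field can pick both the
start event and, when *completep is set, the finish event.  A standalone
sketch of that selection rule (hypothetical stand-ins for the ESC_ZFS_*
sysevent classes, not the ZFS API):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for the ESC_ZFS_* sysevent classes. */
    typedef enum {
            EV_RESILVER_START, EV_RESILVER_FINISH,
            EV_SCRUB_START, EV_SCRUB_FINISH
    } scan_event_t;

    /*
     * A nonzero scrub_min_txg means "only scan blocks born after this
     * txg", i.e. a resilver; a full scrub always starts from txg 0.
     */
    static scan_event_t
    scan_event(uint64_t scrub_min_txg, int finished)
    {
            if (scrub_min_txg != 0)
                    return (finished ? EV_RESILVER_FINISH :
                        EV_RESILVER_START);
            return (finished ? EV_SCRUB_FINISH : EV_SCRUB_START);
    }

    int
    main(void)
    {
            /* A completed full scrub maps to the scrub-finish event. */
            printf("%d\n", scan_event(0, 1));
            return (0);
    }
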
@@ -344,6 +350,12 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
        if (bp->blk_birth <= dp->dp_scrub_min_txg)
                return;
 
+       /*
+        * One block (the "stubby") may have been allocated long ago;
+        * we want to visit it because it is allocated on disk, even
+        * though it has not been claimed (and even though a plain
+        * scrub has nothing to do to it).
+        */
        if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
                return;
 
@@ -369,6 +381,11 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
                if (bp->blk_birth <= dp->dp_scrub_min_txg)
                        return;
 
+               /*
+                * The birth txg can be < claim_txg if this record's
+                * txg has already been synced out but the log block
+                * holding it also contains records that have not.
+                */
                if (claim_txg == 0 || bp->blk_birth < claim_txg)
                        return;
 
@@ -391,7 +408,7 @@ traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
         * We only want to visit blocks that have been claimed but not yet
         * replayed (or, in read-only mode, blocks that *would* be claimed).
         */
-       if (claim_txg == 0 && (spa_mode & FWRITE))
+       if (claim_txg == 0 && spa_writeable(dp->dp_spa))
                return;
 
        zilog = zil_alloc(dp->dp_meta_objset, zh);
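
Between them, the early returns in traverse_zil_block() and
traverse_zil_record() and the claim check in traverse_zil() implement one
small decision rule for ZIL blocks.  A self-contained restatement (a
sketch with hypothetical names; the real checks are spread across the
three functions above):

    #include <assert.h>
    #include <stdint.h>

    /*
     * Should the scrub visit a ZIL block?  claim_txg == 0 means the
     * log has never been claimed; first_txg is the txg at which the
     * pool was imported.
     */
    static int
    zil_block_wanted(uint64_t blk_birth, uint64_t claim_txg,
        uint64_t scrub_min_txg, uint64_t first_txg)
    {
            /* Born at or before the scrub floor: already covered. */
            if (blk_birth <= scrub_min_txg)
                    return (0);

            /*
             * Unclaimed log: only the "stubby" block born before this
             * import is actually allocated on disk, so skip anything
             * born since the import.
             */
            if (claim_txg == 0 && blk_birth >= first_txg)
                    return (0);

            return (1);
    }

    int
    main(void)
    {
            /* Stubby from before the import, log never claimed. */
            assert(zil_block_wanted(50, 0, 0, 100));
            /* Born after the import of an unclaimed log: skip. */
            assert(!zil_block_wanted(150, 0, 0, 100));
            return (0);
    }
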
@@ -409,9 +426,6 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
        int err;
        arc_buf_t *buf = NULL;
 
-       if (bp->blk_birth == 0)
-               return;
-
        if (bp->blk_birth <= dp->dp_scrub_min_txg)
                return;
 
@@ -471,7 +485,7 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
        } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
                uint32_t flags = ARC_WAIT;
                dnode_phys_t *child_dnp;
-               int i, j;
+               int i;
                int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 
                err = arc_read(NULL, dp->dp_spa, bp, pbuf,
@@ -486,20 +500,12 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
                child_dnp = buf->b_data;
 
                for (i = 0; i < epb; i++, child_dnp++) {
-                       for (j = 0; j < child_dnp->dn_nblkptr; j++) {
-                               zbookmark_t czb;
-
-                               SET_BOOKMARK(&czb, zb->zb_objset,
-                                   zb->zb_blkid * epb + i,
-                                   child_dnp->dn_nlevels - 1, j);
-                               scrub_visitbp(dp, child_dnp, buf,
-                                   &child_dnp->dn_blkptr[j], &czb);
-                       }
+                       scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset,
+                           zb->zb_blkid * epb + i);
                }
        } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
                uint32_t flags = ARC_WAIT;
                objset_phys_t *osp;
-               int j;
 
                err = arc_read_nolock(NULL, dp->dp_spa, bp,
                    arc_getbuf_func, &buf,
@@ -515,13 +521,13 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
 
                traverse_zil(dp, &osp->os_zil_header);
 
-               for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
-                       zbookmark_t czb;
-
-                       SET_BOOKMARK(&czb, zb->zb_objset, 0,
-                           osp->os_meta_dnode.dn_nlevels - 1, j);
-                       scrub_visitbp(dp, &osp->os_meta_dnode, buf,
-                           &osp->os_meta_dnode.dn_blkptr[j], &czb);
+               scrub_visitdnode(dp, &osp->os_meta_dnode,
+                   buf, zb->zb_objset, 0);
+               if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+                       scrub_visitdnode(dp, &osp->os_userused_dnode,
+                           buf, zb->zb_objset, 0);
+                       scrub_visitdnode(dp, &osp->os_groupused_dnode,
+                           buf, zb->zb_objset, 0);
                }
        }
 
@@ -531,6 +537,20 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
 }
 
 static void
+scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
+    uint64_t objset, uint64_t object)
+{
+       int j;
+
+       for (j = 0; j < dnp->dn_nblkptr; j++) {
+               zbookmark_t czb;
+
+               SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+               scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
+       }
+}
+
+static void
 scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
 {
        zbookmark_t zb;
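
scrub_visitdnode() folds the two per-blkptr loops that used to be spelled
out in scrub_visitbp() into one helper, and the objset branch now also
walks os_userused_dnode and os_groupused_dnode.  The arc_buf_size() test
guards objsets written before those accounting dnodes existed: an older
objset_phys_t is smaller, so the extra dnodes would lie past the end of
the buffer.  A minimal sketch of that size guard (hypothetical shrunken
layouts, not the real ZFS structures):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical layouts illustrating the guard. */
    typedef struct { uint64_t dn_blkptr[3]; } dnode_sketch_t;

    typedef struct {
            dnode_sketch_t os_meta_dnode;       /* always present */
    } objset_v1_t;

    typedef struct {
            dnode_sketch_t os_meta_dnode;
            dnode_sketch_t os_userused_dnode;   /* added later */
            dnode_sketch_t os_groupused_dnode;  /* added later */
    } objset_v2_t;

    /*
     * Visit the accounting dnodes only if the buffer read from disk
     * is large enough to contain them; an objset written by an older
     * version is only sizeof (objset_v1_t) bytes.
     */
    static int
    has_accounting_dnodes(size_t buf_size)
    {
            return (buf_size >= sizeof (objset_v2_t));
    }

    int
    main(void)
    {
            assert(!has_accounting_dnodes(sizeof (objset_v1_t)));
            assert(has_accounting_dnodes(sizeof (objset_v2_t)));
            return (0);
    }
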
@@ -740,6 +761,7 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 void
 dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
+       spa_t *spa = dp->dp_spa;
        zap_cursor_t zc;
        zap_attribute_t za;
        boolean_t complete = B_TRUE;
@@ -747,8 +769,10 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
        if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
                return;
 
-       /* If the spa is not fully loaded, don't bother. */
-       if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE)
+       /*
+        * If the pool is not loaded, or is trying to unload, leave it alone.
+        */
+       if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
                return;
 
        if (dp->dp_scrub_restart) {
@@ -757,13 +781,13 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
        }
 
-       if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
+       if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
                /*
                 * We must have resumed after rebooting; reset the vdev
                 * stats to know that we're doing a scrub (although it
                 * will think we're just starting now).
                 */
-               vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev,
+               vdev_scrub_stat_update(spa->spa_root_vdev,
                    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
                    POOL_SCRUB_EVERYTHING, B_FALSE);
        }
@@ -771,7 +795,7 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
        dp->dp_scrub_pausing = B_FALSE;
        dp->dp_scrub_start_time = lbolt64;
        dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
-       dp->dp_spa->spa_scrub_active = B_TRUE;
+       spa->spa_scrub_active = B_TRUE;
 
        if (dp->dp_scrub_bookmark.zb_objset == 0) {
                /* First do the MOS & ORIGIN */
@@ -779,8 +803,8 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                if (dp->dp_scrub_pausing)
                        goto out;
 
-               if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
-                       VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+               if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
+                       VERIFY(0 == dmu_objset_find_spa(spa,
                            NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
                } else {
                        scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
@@ -830,15 +854,13 @@ out:
        VERIFY(0 == zap_update(dp->dp_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-           &dp->dp_spa->spa_scrub_errors, tx));
+           &spa->spa_scrub_errors, tx));
 
        /* XXX this is scrub-clean specific */
-       mutex_enter(&dp->dp_spa->spa_scrub_lock);
-       while (dp->dp_spa->spa_scrub_inflight > 0) {
-               cv_wait(&dp->dp_spa->spa_scrub_io_cv,
-                   &dp->dp_spa->spa_scrub_lock);
-       }
-       mutex_exit(&dp->dp_spa->spa_scrub_lock);
+       mutex_enter(&spa->spa_scrub_lock);
+       while (spa->spa_scrub_inflight > 0)
+               cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+       mutex_exit(&spa->spa_scrub_lock);
 }
 
 void
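
The rewritten wait at the end of dsl_pool_scrub_sync() is the standard
condition-variable drain idiom, made readable by the new spa local.  A
generic pthreads rendering of the same pattern (a sketch; the kernel code
above uses the Solaris mutex_enter/cv_wait primitives):

    #include <pthread.h>

    static pthread_mutex_t scrub_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t scrub_io_cv = PTHREAD_COND_INITIALIZER;
    static int scrub_inflight;      /* one count per outstanding I/O */

    /* Issuing side: block until all outstanding I/O has completed. */
    static void
    scrub_drain(void)
    {
            pthread_mutex_lock(&scrub_lock);
            while (scrub_inflight > 0)
                    pthread_cond_wait(&scrub_io_cv, &scrub_lock);
            pthread_mutex_unlock(&scrub_lock);
    }

    /* Completion side: drop the count, wake the drainer at zero. */
    static void
    scrub_io_done(void)
    {
            pthread_mutex_lock(&scrub_lock);
            if (--scrub_inflight == 0)
                    pthread_cond_signal(&scrub_io_cv);
            pthread_mutex_unlock(&scrub_lock);
    }

    int
    main(void)
    {
            scrub_inflight = 1;
            scrub_io_done();        /* completes the only I/O */
            scrub_drain();          /* returns immediately */
            return (0);
    }
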
@@ -920,13 +942,17 @@ static int
 dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
     const blkptr_t *bp, const zbookmark_t *zb)
 {
-       size_t size = BP_GET_LSIZE(bp);
-       int d;
+       size_t size = BP_GET_PSIZE(bp);
        spa_t *spa = dp->dp_spa;
        boolean_t needs_io;
-       int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
+       int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
        int zio_priority;
 
+       ASSERT(bp->blk_birth > dp->dp_scrub_min_txg);
+
+       if (bp->blk_birth >= dp->dp_scrub_max_txg)
+               return (0);
+
        count_block(dp->dp_blkstats, bp);
 
        if (dp->dp_scrub_isresilver == 0) {
@@ -945,7 +971,7 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
        if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
                zio_flags |= ZIO_FLAG_SPECULATIVE;
 
-       for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+       for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
                vdev_t *vd = vdev_lookup_top(spa,
                    DVA_GET_VDEV(&bp->blk_dva[d]));
 
@@ -963,16 +989,17 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
                        if (DVA_GET_GANG(&bp->blk_dva[d])) {
                                /*
                                 * Gang members may be spread across multiple
-                                * vdevs, so the best we can do is look at the
-                                * pool-wide DTL.
+                                * vdevs, so the best estimate we have is the
+                                * scrub range, which has already been checked.
                                 * XXX -- it would be better to change our
-                                * allocation policy to ensure that this can't
-                                * happen.
+                                * allocation policy to ensure that all
+                                * gang members reside on the same vdev.
                                 */
-                               vd = spa->spa_root_vdev;
+                               needs_io = B_TRUE;
+                       } else {
+                               needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+                                   bp->blk_birth, 1);
                        }
-                       needs_io = vdev_dtl_contains(&vd->vdev_dtl_map,
-                           bp->blk_birth, 1);
                }
        }
 
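
Together with the ZIO_FLAG_RAW/BP_GET_PSIZE change above (a scrub reads
the physical, still-compressed block, so it must use the physical size),
the rewritten loop turns the old pool-wide DTL question into a per-vdev
one: during a resilver a block needs I/O only if some copy sits on a
vdev whose DTL_PARTIAL covers the block's birth txg, and gang blocks are
always read because their members may live on vdevs other than the
header's.  A standalone sketch of that decision (hypothetical types;
vdev_dtl_contains() is the real interface):

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical stand-ins for a top-level vdev and a block copy. */
    typedef struct {
            uint64_t dtl_lo, dtl_hi;        /* txgs with missing data */
    } vdev_sketch_t;

    typedef struct {
            vdev_sketch_t *dva_vdev;        /* vdev holding this copy */
            int dva_gang;                   /* gang header? */
    } dva_sketch_t;

    static int
    dtl_contains(const vdev_sketch_t *vd, uint64_t txg)
    {
            return (txg >= vd->dtl_lo && txg <= vd->dtl_hi);
    }

    /*
     * Resilver decision: any copy on a vdev that missed the block's
     * birth txg forces a read; gang blocks always do.
     */
    static int
    resilver_needs_io(const dva_sketch_t *dva, int ndvas, uint64_t birth)
    {
            int needs_io = 0;

            for (int d = 0; d < ndvas && !needs_io; d++) {
                    if (dva[d].dva_gang)
                            needs_io = 1;
                    else
                            needs_io = dtl_contains(dva[d].dva_vdev, birth);
            }
            return (needs_io);
    }

    int
    main(void)
    {
            vdev_sketch_t healthy = { 1, 1 }, degraded = { 90, 110 };
            dva_sketch_t copies[2] = { { &healthy, 0 }, { &degraded, 0 } };

            assert(resilver_needs_io(copies, 2, 100));  /* degraded copy */
            assert(!resilver_needs_io(copies, 1, 100)); /* healthy only */
            return (0);
    }
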
@@ -997,6 +1024,8 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
 int
 dsl_pool_scrub_clean(dsl_pool_t *dp)
 {
+       spa_t *spa = dp->dp_spa;
+
        /*
         * Purge all vdev caches.  We do this here rather than in sync
         * context because this requires a writer lock on the spa_config
@@ -1004,11 +1033,11 @@ dsl_pool_scrub_clean(dsl_pool_t *dp)
         * spa_scrub_reopen flag indicates that vdev_open() should not
         * attempt to start another scrub.
         */
-       spa_config_enter(dp->dp_spa, SCL_ALL, FTAG, RW_WRITER);
-       dp->dp_spa->spa_scrub_reopen = B_TRUE;
-       vdev_reopen(dp->dp_spa->spa_root_vdev);
-       dp->dp_spa->spa_scrub_reopen = B_FALSE;
-       spa_config_exit(dp->dp_spa, SCL_ALL, FTAG);
+       spa_vdev_state_enter(spa);
+       spa->spa_scrub_reopen = B_TRUE;
+       vdev_reopen(spa->spa_root_vdev);
+       spa->spa_scrub_reopen = B_FALSE;
+       (void) spa_vdev_state_exit(spa, NULL, 0);
 
        return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
 }
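
The comment above is the key to this last hunk: vdev_reopen() ends up in
vdev_open(), which may itself attempt to start another scrub, so
spa_scrub_reopen tells vdev_open() that this reopen was issued by the
scrub code and no new scan should be scheduled.  A toy illustration of
the reentrancy-guard pattern (hypothetical names, single-threaded; the
real flag is protected by the vdev state lock taken above):

    #include <stdio.h>

    /* Hypothetical flag mirroring spa_scrub_reopen's role. */
    static int scrub_reopen;

    static void
    vdev_open_sketch(int vdev_is_stale)
    {
            /*
             * Opening a stale vdev normally schedules a new scan,
             * but not when the scrub code itself does the reopen.
             */
            if (vdev_is_stale && !scrub_reopen)
                    printf("schedule scrub/resilver\n");
    }

    static void
    purge_vdev_caches_sketch(void)
    {
            scrub_reopen = 1;       /* suppress recursive scan starts */
            vdev_open_sketch(1);    /* reopen: prints nothing */
            scrub_reopen = 0;
    }

    int
    main(void)
    {
            purge_vdev_caches_sketch();
            vdev_open_sketch(1);    /* normal open: would start a scan */
            return (0);
    }
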