X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fdsl_scan.c;h=107c81b962aa862059fe8147999d175e639a246e;hb=95c73795b001267d6b683b71e8abe51de4b0c938;hp=23c37c7ccfd23923c2fdbaed8e466eb740c6e631;hpb=428870ff734fdaccc342b33fc53cf94724409a46;p=zfs.git diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 23c37c7..107c81b 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -50,12 +50,15 @@ typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); -static scan_cb_t dsl_scan_defrag_cb; static scan_cb_t dsl_scan_scrub_cb; -static scan_cb_t dsl_scan_remove_cb; static dsl_syncfunc_t dsl_scan_cancel_sync; static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); +int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ +int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ +int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ +int zfs_scan_idle = 50; /* idle window in clock ticks */ + int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ @@ -189,9 +192,9 @@ dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) if (vdev_resilver_needed(spa->spa_root_vdev, &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { - spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); + spa_event_notify(spa, NULL, FM_EREPORT_ZFS_RESILVER_START); } else { - spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START); + spa_event_notify(spa, NULL, FM_EREPORT_ZFS_SCRUB_START); } spa->spa_scrub_started = B_TRUE; @@ -208,8 +211,8 @@ dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) /* back to the generic stuff */ if (dp->dp_blkstats == NULL) { - dp->dp_blkstats = - kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t), + KM_SLEEP | KM_NODEBUG); } bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); @@ -292,7 +295,8 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); if (complete) { spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ? - ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + FM_EREPORT_ZFS_RESILVER_FINISH : + FM_EREPORT_ZFS_SCRUB_FINISH); } spa_errlog_rotate(spa); @@ -601,8 +605,8 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, * done before setting xlateall (similar to dsl_read()) */ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, - buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &flags, &czb); + buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb); } static boolean_t @@ -650,6 +654,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp) { dsl_pool_t *dp = scn->scn_dp; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; if (BP_GET_LEVEL(bp) > 0) { @@ -660,7 +665,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -683,7 +688,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -696,7 +701,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -719,7 +724,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -727,9 +732,6 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, osp = (*bufp)->b_data; - if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) - dsl_scan_zil(dp, &osp->os_zil_header); - dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx); @@ -789,18 +791,21 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, { dsl_pool_t *dp = scn->scn_dp; arc_buf_t *buf = NULL; - blkptr_t bp_toread = *bp; + blkptr_t *bp_toread; + + bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); + *bp_toread = *bp; /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ if (dsl_scan_check_pause(scn, zb)) - return; + goto out; if (dsl_scan_check_resume(scn, dnp, zb)) - return; + goto out; if (bp->blk_birth == 0) - return; + goto out; scn->scn_visited_this_txg++; @@ -811,7 +816,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, pbuf, bp); if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) - return; + goto out; if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) { /* @@ -826,12 +831,12 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, * it (original untranslated -> translations from * deleted snap -> now). */ - bp_toread = *bp; + *bp_toread = *bp; } - if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx, + if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx, &buf) != 0) - return; + goto out; /* * If dsl_scan_ddt() has aready visited this block, it will have @@ -841,7 +846,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { ASSERT(buf == NULL); - return; + goto out; } /* @@ -856,6 +861,8 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, } if (buf) (void) arc_buf_remove_ref(buf, &buf); +out: + kmem_free(bp_toread, sizeof(blkptr_t)); } static void @@ -1072,16 +1079,31 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; + objset_t *os; + char *dsname; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + if (dmu_objset_from_ds(ds, &os)) + goto out; + + /* + * Only the ZIL in the head (non-snapshot) is valid. Even though + * snapshots can have ZIL block pointers (which may be the same + * BP as in the head), they must be ignored. So we traverse the + * ZIL here, rather than in scan_recurse(), because the regular + * snapshot block-sharing rules don't apply to it. + */ + if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds)) + dsl_scan_zil(dp, &os->os_zil_header); + /* * Iterate over the bps in this ds. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx); - char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP); + dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " "pausing=%u", @@ -1233,10 +1255,12 @@ static void dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) { ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; - ddt_entry_t dde = { 0 }; + ddt_entry_t dde; int error; uint64_t n = 0; + bzero(&dde, sizeof (ddt_entry_t)); + while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { ddt_t *ddt; @@ -1277,11 +1301,12 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_phys_t *ddp = dde->dde_phys; blkptr_t bp; zbookmark_t zb = { 0 }; + int p; if (scn->scn_phys.scn_state != DSS_SCANNING) return; - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg) continue; @@ -1446,7 +1471,6 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_scan_setup_sync(scn, &func, tx); } - if (!dsl_scan_active(scn) || spa_sync_pass(dp->dp_spa) > 1) return; @@ -1489,7 +1513,6 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (scn->scn_phys.scn_state != DSS_SCANNING) return; - if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { zfs_dbgmsg("doing scan sync txg %llu; " @@ -1643,9 +1666,11 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, size_t size = BP_GET_PSIZE(bp); spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); - boolean_t needs_io; - int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; - int zio_priority; + boolean_t needs_io = B_FALSE; + int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; + int zio_priority = 0; + int scan_delay = 0; + int d; if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) @@ -1658,17 +1683,19 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, zio_flags |= ZIO_FLAG_SCRUB; zio_priority = ZIO_PRIORITY_SCRUB; needs_io = B_TRUE; + scan_delay = zfs_scrub_delay; } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { zio_flags |= ZIO_FLAG_RESILVER; zio_priority = ZIO_PRIORITY_RESILVER; needs_io = B_FALSE; + scan_delay = zfs_resilver_delay; } /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; - for (int d = 0; d < BP_GET_NDVAS(bp); d++) { + for (d = 0; d < BP_GET_NDVAS(bp); d++) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[d])); @@ -1699,14 +1726,23 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, } if (needs_io && !zfs_no_scrub_io) { + vdev_t *rvd = spa->spa_root_vdev; + uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; void *data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) + while (spa->spa_scrub_inflight >= maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); + /* + * If we're seeing recent (zfs_scan_idle) "important" I/Os + * then throttle our workload to limit the impact of a scan. + */ + if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) + delay(scan_delay); + zio_nowait(zio_read(NULL, spa, bp, data, size, dsl_scan_scrub_done, NULL, zio_priority, zio_flags, zb));