X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fdsl_scan.c;h=34a4f0382d9afc054c7f8a486a31e9f3b03abc97;hb=294f68063b49c06d3118d51016811063e69cf97a;hp=ff36ea96b472d5cbe99dc80c31cb101cc439bcb6;hpb=40a39e1103bbbd6908bc1b5cf4af953c7e4706ad;p=zfs.git

diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index ff36ea9..34a4f03 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_scan.h>
@@ -44,6 +45,7 @@
 #include <sys/ddt.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
+#include <sys/zfeature.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
@@ -62,8 +64,8 @@ int zfs_scan_idle = 50; /* idle window in clock ticks */
 int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
 int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
 int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
-boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
+int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
 
 int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
@@ -71,8 +73,6 @@ int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
         ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
         (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
 
-extern int zfs_txg_timeout;
-
 /* the order has to match pool_scan_type */
 static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
         NULL,
@@ -212,7 +212,7 @@ dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 
         if (dp->dp_blkstats == NULL) {
                 dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t),
-                    KM_SLEEP | KM_NODEBUG);
+                    KM_PUSHPAGE | KM_NODEBUG);
         }
         bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 
@@ -346,8 +346,8 @@ static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
     dnode_phys_t *dnp, arc_buf_t *pbuf, dsl_dataset_t *ds, dsl_scan_t *scn,
     dmu_objset_type_t ostype, dmu_tx_t *tx);
-static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
-    dmu_objset_type_t ostype,
+inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
+    dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
     dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
 
 void
@@ -363,73 +363,6 @@ dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
         zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
 }
 
-int
-dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
-    arc_done_func_t *done, void *private, int priority, int zio_flags,
-    uint32_t *arc_flags, const zbookmark_t *zb)
-{
-        return (arc_read(pio, spa, bpp, pbuf, done, private,
-            priority, zio_flags, arc_flags, zb));
-}
-
-int
-dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
-    arc_done_func_t *done, void *private, int priority, int zio_flags,
-    uint32_t *arc_flags, const zbookmark_t *zb)
-{
-        return (arc_read_nolock(pio, spa, bpp, done, private,
-            priority, zio_flags, arc_flags, zb));
-}
-
-static boolean_t
-bookmark_is_zero(const zbookmark_t *zb)
-{
-        return (zb->zb_objset == 0 && zb->zb_object == 0 &&
-            zb->zb_level == 0 && zb->zb_blkid == 0);
-}
-
-/* dnp is the dnode for zb1->zb_object */
-static boolean_t
-bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
-    const zbookmark_t *zb2)
-{
-        uint64_t zb1nextL0, zb2thisobj;
-
-        ASSERT(zb1->zb_objset == zb2->zb_objset);
-        ASSERT(zb2->zb_level == 0);
-
-        /*
-         * A bookmark in the deadlist is considered to be after
-         * everything else.
-         */
-        if (zb2->zb_object == DMU_DEADLIST_OBJECT)
-                return (B_TRUE);
-
-        /* The objset_phys_t isn't before anything. */
-        if (dnp == NULL)
-                return (B_FALSE);
-
-        zb1nextL0 = (zb1->zb_blkid + 1) <<
-            ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
-        zb2thisobj = zb2->zb_object ? zb2->zb_object :
-            zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
-        if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
-                uint64_t nextobj = zb1nextL0 *
-                    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
-                return (nextobj <= zb2thisobj);
-        }
-
-        if (zb1->zb_object < zb2thisobj)
-                return (B_TRUE);
-        if (zb1->zb_object > zb2thisobj)
-                return (B_FALSE);
-        if (zb2->zb_object == DMU_META_DNODE_OBJECT)
-                return (B_FALSE);
-        return (zb1nextL0 <= zb2->zb_blkid);
-}
-
 static uint64_t
 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
@@ -461,7 +394,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
         if (scn->scn_pausing)
                 return (B_TRUE); /* we're already pausing */
 
-        if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+        if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
                 return (B_FALSE); /* we're resuming */
 
         /* We only know how to resume from level-0 blocks. */
@@ -600,12 +533,8 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
 
         SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
 
-        /*
-         * XXX need to make sure all of these arc_read() prefetches are
-         * done before setting xlateall (similar to dsl_read())
-         */
         (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
-            buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+            NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
             ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
 }
 
@@ -616,13 +545,13 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
         /*
          * We never skip over user/group accounting objects (obj<0)
          */
-        if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+        if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
             (int64_t)zb->zb_object >= 0) {
                 /*
                  * If we already visited this bp & everything below (in
                  * a prior txg sync), don't bother doing it again.
                  */
-                if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+                if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
                         return (B_TRUE);
 
                 /*
@@ -648,7 +577,7 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
  * Return nonzero on i/o error.
  * Return new buf to write out in *bufp.
  */
-__attribute__((always_inline)) static int
+inline __attribute__((always_inline)) static int
 dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
     dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb,
     dmu_tx_t *tx, arc_buf_t **bufp)
@@ -663,8 +592,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
                 blkptr_t *cbp;
                 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 
-                err = arc_read_nolock(NULL, dp->dp_spa, bp,
-                    arc_getbuf_func, bufp,
+                err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
                     ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
                 if (err) {
                         scn->scn_phys.scn_errors++;
@@ -686,8 +614,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
         } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
                 uint32_t flags = ARC_WAIT;
 
-                err = arc_read_nolock(NULL, dp->dp_spa, bp,
-                    arc_getbuf_func, bufp,
+                err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
                     ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
                 if (err) {
                         scn->scn_phys.scn_errors++;
@@ -699,8 +626,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
                 int i, j;
                 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 
-                err = arc_read_nolock(NULL, dp->dp_spa, bp,
-                    arc_getbuf_func, bufp,
+                err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
                     ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
                 if (err) {
                         scn->scn_phys.scn_errors++;
@@ -722,8 +648,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
                 uint32_t flags = ARC_WAIT;
                 objset_phys_t *osp;
 
-                err = arc_read_nolock(NULL, dp->dp_spa, bp,
-                    arc_getbuf_func, bufp,
+                err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
                     ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
                 if (err) {
                         scn->scn_phys.scn_errors++;
@@ -754,7 +679,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
         return (0);
 }
 
-__attribute__((always_inline)) static void
+inline __attribute__((always_inline)) static void
 dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
     dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
     uint64_t object, dmu_tx_t *tx)
@@ -793,7 +718,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
         arc_buf_t *buf = NULL;
         blkptr_t *bp_toread;
 
-        bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+        bp_toread = kmem_alloc(sizeof (blkptr_t), KM_PUSHPAGE);
         *bp_toread = *bp;
 
         /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
@@ -825,22 +750,6 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
         if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
                 goto out;
 
-        if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
-                /*
-                 * For non-user-accounting blocks, we need to read the
-                 * new bp (from a deleted snapshot, found in
-                 * check_existing_xlation). If we used the old bp,
-                 * pointers inside this block from before we resumed
-                 * would be untranslated.
-                 *
-                 * For user-accounting blocks, we need to read the old
-                 * bp, because we will apply the entire space delta to
-                 * it (original untranslated -> translations from
-                 * deleted snap -> now).
-                 */
-                *bp_toread = *bp;
-        }
-
         if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx,
             &buf) != 0)
                 goto out;
@@ -1110,7 +1019,7 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
         dmu_buf_will_dirty(ds->ds_dbuf, tx);
         dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
 
-        dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
+        dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_PUSHPAGE);
         dsl_dataset_name(ds, dsname);
         zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
             "pausing=%u",
@@ -1377,8 +1286,8 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
          * bookmark so we don't think that we're still trying to resume.
          */
         bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
-        zc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP);
-        za = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP);
+        zc = kmem_alloc(sizeof(zap_cursor_t), KM_PUSHPAGE);
+        za = kmem_alloc(sizeof(zap_attribute_t), KM_PUSHPAGE);
 
         /* keep pulling things out of the zap-object-as-queue */
         while (zap_cursor_init(zc, dp->dp_meta_objset,
@@ -1416,19 +1325,28 @@ out:
         kmem_free(zc, sizeof(zap_cursor_t));
 }
 
-static int
-dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+static boolean_t
+dsl_scan_free_should_pause(dsl_scan_t *scn)
 {
-        dsl_scan_t *scn = arg;
         uint64_t elapsed_nanosecs;
 
         elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
-
-        if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+        return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
             (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
             txg_sync_waiting(scn->scn_dp)) ||
-            spa_shutting_down(scn->scn_dp->dp_spa))
-                return (ERESTART);
+            spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+static int
+dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+        dsl_scan_t *scn = arg;
+
+        if (!scn->scn_is_bptree ||
+            (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
+                if (dsl_scan_free_should_pause(scn))
+                        return (ERESTART);
+        }
 
         zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
             dmu_tx_get_txg(tx), bp, 0));
@@ -1453,6 +1371,10 @@ dsl_scan_active(dsl_scan_t *scn)
         if (scn->scn_phys.scn_state == DSS_SCANNING)
                 return (B_TRUE);
 
+        if (spa_feature_is_active(spa,
+            &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+                return (B_TRUE);
+        }
         if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
                 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
                     &used, &comp, &uncomp);
@@ -1499,14 +1421,40 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
          * traversing it.
          */
         if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+                scn->scn_is_bptree = B_FALSE;
                 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
                     NULL, ZIO_FLAG_MUSTSUCCEED);
                 err = bpobj_iterate(&dp->dp_free_bpobj,
-                    dsl_scan_free_cb, scn, tx);
+                    dsl_scan_free_block_cb, scn, tx);
                 VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+
+                if (err == 0 && spa_feature_is_active(spa,
+                    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+                        scn->scn_is_bptree = B_TRUE;
+                        scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+                            NULL, ZIO_FLAG_MUSTSUCCEED);
+                        err = bptree_iterate(dp->dp_meta_objset,
+                            dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
+                            scn, tx);
+                        VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+                        if (err != 0)
+                                return;
+
+                        /* disable async destroy feature */
+                        spa_feature_decr(spa,
+                            &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
+                        ASSERT(!spa_feature_is_active(spa,
+                            &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
+                        VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+                            DMU_POOL_DIRECTORY_OBJECT,
+                            DMU_POOL_BPTREE_OBJ, tx));
+                        VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
+                            dp->dp_bptree_obj, tx));
+                        dp->dp_bptree_obj = 0;
+                }
                 if (scn->scn_visited_this_txg) {
                         zfs_dbgmsg("freed %llu blocks in %llums from "
-                            "free_bpobj txg %llu",
+                            "free_bpobj/bptree txg %llu",
                             (longlong_t)scn->scn_visited_this_txg,
                             (longlong_t)
                             (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
@@ -1621,9 +1569,13 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
         for (i = 0; i < 4; i++) {
                 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
                 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
-                zfs_blkstat_t *zb = &zab->zab_type[l][t];
                 int equal;
+                zfs_blkstat_t *zb;
+
+                if (t & DMU_OT_NEWTYPE)
+                        t = DMU_OT_OTHER;
 
+                zb = &zab->zab_type[l][t];
                 zb->zb_count++;
                 zb->zb_asize += BP_GET_ASIZE(bp);
                 zb->zb_lsize += BP_GET_LSIZE(bp);
@@ -1785,3 +1737,32 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
         return (dsl_sync_task_do(dp, dsl_scan_setup_check,
             dsl_scan_setup_sync, dp->dp_scan, &func, 0));
 }
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_top_maxinflight, int, 0644);
+MODULE_PARM_DESC(zfs_top_maxinflight, "Max I/Os per top-level");
+
+module_param(zfs_resilver_delay, int, 0644);
+MODULE_PARM_DESC(zfs_resilver_delay, "Number of ticks to delay resilver");
+
+module_param(zfs_scrub_delay, int, 0644);
+MODULE_PARM_DESC(zfs_scrub_delay, "Number of ticks to delay scrub");
+
+module_param(zfs_scan_idle, int, 0644);
+MODULE_PARM_DESC(zfs_scan_idle, "Idle window in clock ticks");
+
+module_param(zfs_scan_min_time_ms, int, 0644);
+MODULE_PARM_DESC(zfs_scan_min_time_ms, "Min millisecs to scrub per txg");
+
+module_param(zfs_free_min_time_ms, int, 0644);
+MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg");
+
+module_param(zfs_resilver_min_time_ms, int, 0644);
+MODULE_PARM_DESC(zfs_resilver_min_time_ms, "Min millisecs to resilver per txg");
+
+module_param(zfs_no_scrub_io, int, 0644);
+MODULE_PARM_DESC(zfs_no_scrub_io, "Set to disable scrub I/O");
+
+module_param(zfs_no_scrub_prefetch, int, 0644);
+MODULE_PARM_DESC(zfs_no_scrub_prefetch, "Set to disable scrub prefetching");
+#endif
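
Notes on the patch follow; these are editorial sketches, not part of the diff.

The hunks that turn "__attribute__((always_inline)) static" into "inline __attribute__((always_inline)) static" match GCC's expectation that an always_inline function also be declared inline; without the keyword, newer GCC versions can emit "warning: always_inline function might not be inlinable [-Wattributes]". A minimal standalone illustration (made-up function name, not ZFS code):

#include <stdio.h>

/*
 * The form used by the patch: "inline" plus the attribute. Dropping
 * the "inline" keyword can trigger, on newer GCC:
 *     warning: always_inline function might not be inlinable [-Wattributes]
 */
inline __attribute__((always_inline)) static int
add_always(int a, int b)
{
        return (a + b);
}

int
main(void)
{
        printf("%d\n", add_always(2, 3));
        return (0);
}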
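The dsl_scan_free_cb() rework into dsl_scan_free_should_pause() plus dsl_scan_free_block_cb() is the patch's core pattern: do freeing work in bounded slices, return ERESTART once the slice's time budget is spent (zfs_txg_timeout, or zfs_free_min_time_ms while a txg sync is waiting), and rely on bpobj_iterate()/bptree_iterate() to persist the position so the next txg resumes where this one stopped. Below is a self-contained sketch of that time-budgeted, resumable iteration; the timer, cursor, and 5 ms budget are simplified stand-ins, not the ZFS API:

#include <errno.h>
#include <stdio.h>
#include <time.h>

#ifndef ERESTART
#define ERESTART 85             /* not in all libc headers; Linux value */
#endif

#define NANOSEC 1000000000LL
#define BUDGET_NS (5 * 1000000LL)       /* 5 ms of work per "txg sync" pass */

static long long
now_ns(void)
{
        struct timespec ts;

        (void) clock_gettime(CLOCK_MONOTONIC, &ts);
        return ((long long)ts.tv_sec * NANOSEC + ts.tv_nsec);
}

/* Stands in for dsl_scan_free_should_pause(): is this pass's budget spent? */
static int
should_pause(long long start_ns)
{
        return (now_ns() - start_ns > BUDGET_NS);
}

/*
 * Stands in for dsl_scan_free_block_cb(): process items until the budget
 * is spent, then return ERESTART; *cursor keeps the resume position the
 * way the bpobj/bptree iterators persist theirs.
 */
static int
free_items_cb(long *cursor, long nitems, long long start_ns)
{
        while (*cursor < nitems) {
                if (should_pause(start_ns))
                        return (ERESTART);
                (*cursor)++;            /* "free" one block */
        }
        return (0);
}

int
main(void)
{
        long cursor = 0;
        int err, passes = 0;

        do {    /* each loop iteration models one txg sync */
                err = free_items_cb(&cursor, 10 * 1000 * 1000, now_ns());
                passes++;
        } while (err == ERESTART);
        printf("freed %ld blocks in %d passes\n", cursor, passes);
        return (0);
}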
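The module_param()/MODULE_PARM_DESC() block at the end pairs with the boolean_t-to-int change near the top: module_param() has no boolean_t type, so the two scrub switches become plain ints, and each tunable is exposed under /sys/module/zfs/parameters/ with mode 0644 (world-readable, root-writable at runtime). A hypothetical out-of-tree module showing the same pattern (demo names invented; Kbuild glue omitted):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

/*
 * Readable by everyone, writable by root at runtime via
 * /sys/module/param_demo/parameters/demo_scan_idle.
 */
static int demo_scan_idle = 50;
module_param(demo_scan_idle, int, 0644);
MODULE_PARM_DESC(demo_scan_idle, "Idle window in clock ticks");

static int __init
param_demo_init(void)
{
        pr_info("param_demo: demo_scan_idle=%d\n", demo_scan_idle);
        return 0;
}

static void __exit
param_demo_exit(void)
{
}

module_init(param_demo_init);
module_exit(param_demo_exit);
MODULE_LICENSE("GPL");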