*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
*/
#include <sys/dsl_scan.h>
#include <sys/ddt.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif
((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
-extern int zfs_txg_timeout;
-
/* the order has to match pool_scan_type */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
NULL,
if (dp->dp_blkstats == NULL) {
dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t),
- KM_SLEEP | KM_NODEBUG);
+ KM_PUSHPAGE | KM_NODEBUG);
}
bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
}
-int
-dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
- arc_done_func_t *done, void *private, int priority, int zio_flags,
- uint32_t *arc_flags, const zbookmark_t *zb)
-{
- return (arc_read(pio, spa, bpp, pbuf, done, private,
- priority, zio_flags, arc_flags, zb));
-}
-
-int
-dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
- arc_done_func_t *done, void *private, int priority, int zio_flags,
- uint32_t *arc_flags, const zbookmark_t *zb)
-{
- return (arc_read_nolock(pio, spa, bpp, done, private,
- priority, zio_flags, arc_flags, zb));
-}
-
-static boolean_t
-bookmark_is_zero(const zbookmark_t *zb)
-{
- return (zb->zb_objset == 0 && zb->zb_object == 0 &&
- zb->zb_level == 0 && zb->zb_blkid == 0);
-}
-
-/* dnp is the dnode for zb1->zb_object */
-static boolean_t
-bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
- const zbookmark_t *zb2)
-{
- uint64_t zb1nextL0, zb2thisobj;
-
- ASSERT(zb1->zb_objset == zb2->zb_objset);
- ASSERT(zb2->zb_level == 0);
-
- /*
- * A bookmark in the deadlist is considered to be after
- * everything else.
- */
- if (zb2->zb_object == DMU_DEADLIST_OBJECT)
- return (B_TRUE);
-
- /* The objset_phys_t isn't before anything. */
- if (dnp == NULL)
- return (B_FALSE);
-
- zb1nextL0 = (zb1->zb_blkid + 1) <<
- ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
- zb2thisobj = zb2->zb_object ? zb2->zb_object :
- zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
- if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
- uint64_t nextobj = zb1nextL0 *
- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
- return (nextobj <= zb2thisobj);
- }
-
- if (zb1->zb_object < zb2thisobj)
- return (B_TRUE);
- if (zb1->zb_object > zb2thisobj)
- return (B_FALSE);
- if (zb2->zb_object == DMU_META_DNODE_OBJECT)
- return (B_FALSE);
- return (zb1nextL0 <= zb2->zb_blkid);
-}
-
static uint64_t
dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
{
if (scn->scn_pausing)
return (B_TRUE); /* we're already pausing */
- if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
return (B_FALSE); /* we're resuming */
/* We only know how to resume from level-0 blocks. */
SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
- /*
- * XXX need to make sure all of these arc_read() prefetches are
- * done before setting xlateall (similar to dsl_read())
- */
(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
- buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
}
/*
* We never skip over user/group accounting objects (obj<0)
*/
- if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
(int64_t)zb->zb_object >= 0) {
/*
* If we already visited this bp & everything below (in
* a prior txg sync), don't bother doing it again.
*/
- if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+ if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
return (B_TRUE);
/*
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
- err = arc_read_nolock(NULL, dp->dp_spa, bp,
- arc_getbuf_func, bufp,
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
} else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
uint32_t flags = ARC_WAIT;
- err = arc_read_nolock(NULL, dp->dp_spa, bp,
- arc_getbuf_func, bufp,
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
int i, j;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
- err = arc_read_nolock(NULL, dp->dp_spa, bp,
- arc_getbuf_func, bufp,
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
uint32_t flags = ARC_WAIT;
objset_phys_t *osp;
- err = arc_read_nolock(NULL, dp->dp_spa, bp,
- arc_getbuf_func, bufp,
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
arc_buf_t *buf = NULL;
blkptr_t *bp_toread;
- bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ bp_toread = kmem_alloc(sizeof (blkptr_t), KM_PUSHPAGE);
*bp_toread = *bp;
/* ASSERT(pbuf == NULL || arc_released(pbuf)); */
if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
goto out;
- if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
- /*
- * For non-user-accounting blocks, we need to read the
- * new bp (from a deleted snapshot, found in
- * check_existing_xlation). If we used the old bp,
- * pointers inside this block from before we resumed
- * would be untranslated.
- *
- * For user-accounting blocks, we need to read the old
- * bp, because we will apply the entire space delta to
- * it (original untranslated -> translations from
- * deleted snap -> now).
- */
- *bp_toread = *bp;
- }
-
if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx,
&buf) != 0)
goto out;
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
- dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
+ dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_PUSHPAGE);
dsl_dataset_name(ds, dsname);
zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
"pausing=%u",
* bookmark so we don't think that we're still trying to resume.
*/
bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
- zc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP);
- za = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP);
+ zc = kmem_alloc(sizeof(zap_cursor_t), KM_PUSHPAGE);
+ za = kmem_alloc(sizeof(zap_attribute_t), KM_PUSHPAGE);
/* keep pulling things out of the zap-object-as-queue */
while (zap_cursor_init(zc, dp->dp_meta_objset,
kmem_free(zc, sizeof(zap_cursor_t));
}
-static int
-dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+static boolean_t
+dsl_scan_free_should_pause(dsl_scan_t *scn)
{
- dsl_scan_t *scn = arg;
uint64_t elapsed_nanosecs;
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
-
- if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
(elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
txg_sync_waiting(scn->scn_dp)) ||
- spa_shutting_down(scn->scn_dp->dp_spa))
- return (ERESTART);
+ spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+static int
+dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg;
+
+ if (!scn->scn_is_bptree ||
+ (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
+ if (dsl_scan_free_should_pause(scn))
+ return (ERESTART);
+ }
zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
dmu_tx_get_txg(tx), bp, 0));
if (scn->scn_phys.scn_state == DSS_SCANNING)
return (B_TRUE);
+ if (spa_feature_is_active(spa,
+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+ return (B_TRUE);
+ }
if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
&used, &comp, &uncomp);
* traversing it.
*/
if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ scn->scn_is_bptree = B_FALSE;
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
err = bpobj_iterate(&dp->dp_free_bpobj,
- dsl_scan_free_cb, scn, tx);
+ dsl_scan_free_block_cb, scn, tx);
VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+
+ if (err == 0 && spa_feature_is_active(spa,
+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+ scn->scn_is_bptree = B_TRUE;
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bptree_iterate(dp->dp_meta_objset,
+ dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
+ scn, tx);
+ VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+ if (err != 0)
+ return;
+
+ /* disable async destroy feature */
+ spa_feature_decr(spa,
+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
+ ASSERT(!spa_feature_is_active(spa,
+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
+ VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_BPTREE_OBJ, tx));
+ VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
+ dp->dp_bptree_obj, tx));
+ dp->dp_bptree_obj = 0;
+ }
if (scn->scn_visited_this_txg) {
zfs_dbgmsg("freed %llu blocks in %llums from "
- "free_bpobj txg %llu",
+ "free_bpobj/bptree txg %llu",
(longlong_t)scn->scn_visited_this_txg,
(longlong_t)
(gethrtime() - scn->scn_sync_start_time) / MICROSEC,
for (i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
- zfs_blkstat_t *zb = &zab->zab_type[l][t];
int equal;
+ zfs_blkstat_t *zb;
+
+ if (t & DMU_OT_NEWTYPE)
+ t = DMU_OT_OTHER;
+ zb = &zab->zab_type[l][t];
zb->zb_count++;
zb->zb_asize += BP_GET_ASIZE(bp);
zb->zb_lsize += BP_GET_LSIZE(bp);
module_param(zfs_no_scrub_prefetch, int, 0644);
MODULE_PARM_DESC(zfs_no_scrub_prefetch, "Set to disable scrub prefetching");
-
-module_param(zfs_txg_timeout, int, 0644);
-MODULE_PARM_DESC(zfs_txg_timeout, "Max seconds worth of delta per txg");
#endif