X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fmetaslab.c;h=d199921b7edf1a17d6bc7ab2c20275941a340d91;hb=36f86f73f68548f46eb3229c8adf583d59fa9988;hp=412832968dd3b82325b215c2b59dc5ce63cfbd5f;hpb=fb5f0bc83330c8a0236c4d34a23723ac1974971a;p=zfs.git diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 4128329..d199921 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include -#include #include #include #include @@ -32,22 +31,83 @@ #include #include +#define WITH_DF_BLOCK_ALLOCATOR + +/* + * Allow allocations to switch to gang blocks quickly. We do this to + * avoid having to load lots of space_maps in a given txg. There are, + * however, some cases where we want to avoid "fast" ganging and instead + * we want to do an exhaustive search of all metaslabs on this device. + * Currently we don't allow any gang, zil, or dump device related allocations + * to "fast" gang. + */ +#define CAN_FASTGANG(flags) \ + (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ + METASLAB_GANG_AVOID))) + uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* + * This value defines the number of allowed allocation failures per vdev. + * If a device reaches this threshold in a given txg then we consider skipping + * allocations on that device. + */ +int zfs_mg_alloc_failures; + +/* + * Metaslab debugging: when set, keeps all space maps in core to verify frees. + */ +static int metaslab_debug = 0; + +/* + * Minimum size which forces the dynamic allocator to change + * it's allocation strategy. Once the space map cannot satisfy + * an allocation of this size then it switches to using more + * aggressive strategy (i.e search by size rather than offset). + */ +uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; + +/* + * The minimum free space, in percent, which must be available + * in a space map to continue allocations in a first-fit fashion. + * Once the space_map's free space drops below this level we dynamically + * switch to using best-fit allocations. + */ +int metaslab_df_free_pct = 4; + +/* + * A metaslab is considered "free" if it contains a contiguous + * segment which is greater than metaslab_min_alloc_size. + */ +uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; + +/* + * Max number of space_maps to prefetch. + */ +int metaslab_prefetch_limit = SPA_DVAS_PER_BP; + +/* + * Percentage bonus multiplier for metaslabs that are in the bonus area. + */ +int metaslab_smo_bonus_pct = 150; + +/* * ========================================================================== * Metaslab classes * ========================================================================== */ metaslab_class_t * -metaslab_class_create(void) +metaslab_class_create(spa_t *spa, space_map_ops_t *ops) { metaslab_class_t *mc; - mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); + mc = kmem_zalloc(sizeof (metaslab_class_t), KM_PUSHPAGE); + mc->mc_spa = spa; mc->mc_rotor = NULL; + mc->mc_ops = ops; + mutex_init(&mc->mc_fastwrite_lock, NULL, MUTEX_DEFAULT, NULL); return (mc); } @@ -55,58 +115,74 @@ metaslab_class_create(void) void metaslab_class_destroy(metaslab_class_t *mc) { - metaslab_group_t *mg; - - while ((mg = mc->mc_rotor) != NULL) { - metaslab_class_remove(mc, mg); - metaslab_group_destroy(mg); - } + ASSERT(mc->mc_rotor == NULL); + ASSERT(mc->mc_alloc == 0); + ASSERT(mc->mc_deferred == 0); + ASSERT(mc->mc_space == 0); + ASSERT(mc->mc_dspace == 0); + mutex_destroy(&mc->mc_fastwrite_lock); kmem_free(mc, sizeof (metaslab_class_t)); } -void -metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg) +int +metaslab_class_validate(metaslab_class_t *mc) { - metaslab_group_t *mgprev, *mgnext; + metaslab_group_t *mg; + vdev_t *vd; - ASSERT(mg->mg_class == NULL); + /* + * Must hold one of the spa_config locks. + */ + ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || + spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); - if ((mgprev = mc->mc_rotor) == NULL) { - mg->mg_prev = mg; - mg->mg_next = mg; - } else { - mgnext = mgprev->mg_next; - mg->mg_prev = mgprev; - mg->mg_next = mgnext; - mgprev->mg_next = mg; - mgnext->mg_prev = mg; - } - mc->mc_rotor = mg; - mg->mg_class = mc; + if ((mg = mc->mc_rotor) == NULL) + return (0); + + do { + vd = mg->mg_vd; + ASSERT(vd->vdev_mg != NULL); + ASSERT3P(vd->vdev_top, ==, vd); + ASSERT3P(mg->mg_class, ==, mc); + ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); + } while ((mg = mg->mg_next) != mc->mc_rotor); + + return (0); } void -metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg) +metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, + int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { - metaslab_group_t *mgprev, *mgnext; + atomic_add_64(&mc->mc_alloc, alloc_delta); + atomic_add_64(&mc->mc_deferred, defer_delta); + atomic_add_64(&mc->mc_space, space_delta); + atomic_add_64(&mc->mc_dspace, dspace_delta); +} - ASSERT(mg->mg_class == mc); +uint64_t +metaslab_class_get_alloc(metaslab_class_t *mc) +{ + return (mc->mc_alloc); +} - mgprev = mg->mg_prev; - mgnext = mg->mg_next; +uint64_t +metaslab_class_get_deferred(metaslab_class_t *mc) +{ + return (mc->mc_deferred); +} - if (mg == mgnext) { - mc->mc_rotor = NULL; - } else { - mc->mc_rotor = mgnext; - mgprev->mg_next = mgnext; - mgnext->mg_prev = mgprev; - } +uint64_t +metaslab_class_get_space(metaslab_class_t *mc) +{ + return (mc->mc_space); +} - mg->mg_prev = NULL; - mg->mg_next = NULL; - mg->mg_class = NULL; +uint64_t +metaslab_class_get_dspace(metaslab_class_t *mc) +{ + return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); } /* @@ -143,13 +219,13 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) { metaslab_group_t *mg; - mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); + mg = kmem_zalloc(sizeof (metaslab_group_t), KM_PUSHPAGE); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); - mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children); mg->mg_vd = vd; - metaslab_class_add(mc, mg); + mg->mg_class = mc; + mg->mg_activation_count = 0; return (mg); } @@ -157,11 +233,82 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) void metaslab_group_destroy(metaslab_group_t *mg) { + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + /* + * We may have gone below zero with the activation count + * either because we never activated in the first place or + * because we're done, and possibly removing the vdev. + */ + ASSERT(mg->mg_activation_count <= 0); + avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); kmem_free(mg, sizeof (metaslab_group_t)); } +void +metaslab_group_activate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + metaslab_group_t *mgprev, *mgnext; + + ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count <= 0); + + if (++mg->mg_activation_count <= 0) + return; + + mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + + if ((mgprev = mc->mc_rotor) == NULL) { + mg->mg_prev = mg; + mg->mg_next = mg; + } else { + mgnext = mgprev->mg_next; + mg->mg_prev = mgprev; + mg->mg_next = mgnext; + mgprev->mg_next = mg; + mgnext->mg_prev = mg; + } + mc->mc_rotor = mg; +} + +void +metaslab_group_passivate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + metaslab_group_t *mgprev, *mgnext; + + ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + + if (--mg->mg_activation_count != 0) { + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count < 0); + return; + } + + mgprev = mg->mg_prev; + mgnext = mg->mg_next; + + if (mg == mgnext) { + mc->mc_rotor = NULL; + } else { + mc->mc_rotor = mgnext; + mgprev->mg_next = mgnext; + mgnext->mg_prev = mgprev; + } + + mg->mg_prev = NULL; + mg->mg_next = NULL; +} + static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { @@ -203,29 +350,42 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) /* * ========================================================================== - * The first-fit block allocator + * Common allocator routines * ========================================================================== */ -static void -metaslab_ff_load(space_map_t *sm) +static int +metaslab_segsize_compare(const void *x1, const void *x2) { - ASSERT(sm->sm_ppd == NULL); - sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); -} + const space_seg_t *s1 = x1; + const space_seg_t *s2 = x2; + uint64_t ss_size1 = s1->ss_end - s1->ss_start; + uint64_t ss_size2 = s2->ss_end - s2->ss_start; -static void -metaslab_ff_unload(space_map_t *sm) -{ - kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); - sm->sm_ppd = NULL; + if (ss_size1 < ss_size2) + return (-1); + if (ss_size1 > ss_size2) + return (1); + + if (s1->ss_start < s2->ss_start) + return (-1); + if (s1->ss_start > s2->ss_start) + return (1); + + return (0); } +#if defined(WITH_FF_BLOCK_ALLOCATOR) || \ + defined(WITH_DF_BLOCK_ALLOCATOR) || \ + defined(WITH_CDF_BLOCK_ALLOCATOR) +/* + * This is a helper function that can be used by the allocator to find + * a suitable block to allocate. This will search the specified AVL + * tree looking for a block that matches the specified criteria. + */ static uint64_t -metaslab_ff_alloc(space_map_t *sm, uint64_t size) +metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, + uint64_t align) { - avl_tree_t *t = &sm->sm_root; - uint64_t align = size & -size; - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; space_seg_t *ss, ssearch; avl_index_t where; @@ -254,31 +414,307 @@ metaslab_ff_alloc(space_map_t *sm, uint64_t size) return (-1ULL); *cursor = 0; - return (metaslab_ff_alloc(sm, size)); + return (metaslab_block_picker(t, cursor, size, align)); +} +#endif /* WITH_FF/DF/CDF_BLOCK_ALLOCATOR */ + +static void +metaslab_pp_load(space_map_t *sm) +{ + space_seg_t *ss; + + ASSERT(sm->sm_ppd == NULL); + sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_PUSHPAGE); + + sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_PUSHPAGE); + avl_create(sm->sm_pp_root, metaslab_segsize_compare, + sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + avl_add(sm->sm_pp_root, ss); +} + +static void +metaslab_pp_unload(space_map_t *sm) +{ + void *cookie = NULL; + + kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); + sm->sm_ppd = NULL; + + while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { + /* tear down the tree */ + } + + avl_destroy(sm->sm_pp_root); + kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); + sm->sm_pp_root = NULL; } /* ARGSUSED */ static void -metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size) +metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) { /* No need to update cursor */ } /* ARGSUSED */ static void -metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size) +metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) { /* No need to update cursor */ } +/* + * Return the maximum contiguous segment within the metaslab. + */ +uint64_t +metaslab_pp_maxsize(space_map_t *sm) +{ + avl_tree_t *t = sm->sm_pp_root; + space_seg_t *ss; + + if (t == NULL || (ss = avl_last(t)) == NULL) + return (0ULL); + + return (ss->ss_end - ss->ss_start); +} + +#if defined(WITH_FF_BLOCK_ALLOCATOR) +/* + * ========================================================================== + * The first-fit block allocator + * ========================================================================== + */ +static uint64_t +metaslab_ff_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + + return (metaslab_block_picker(t, cursor, size, align)); +} + +/* ARGSUSED */ +boolean_t +metaslab_ff_fragmented(space_map_t *sm) +{ + return (B_TRUE); +} + static space_map_ops_t metaslab_ff_ops = { - metaslab_ff_load, - metaslab_ff_unload, + metaslab_pp_load, + metaslab_pp_unload, metaslab_ff_alloc, - metaslab_ff_claim, - metaslab_ff_free + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_ff_fragmented +}; + +space_map_ops_t *zfs_metaslab_ops = &metaslab_ff_ops; +#endif /* WITH_FF_BLOCK_ALLOCATOR */ + +#if defined(WITH_DF_BLOCK_ALLOCATOR) +/* + * ========================================================================== + * Dynamic block allocator - + * Uses the first fit allocation scheme until space get low and then + * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold + * and metaslab_df_free_pct to determine when to switch the allocation scheme. + * ========================================================================== + */ +static uint64_t +metaslab_df_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + uint64_t max_size = metaslab_pp_maxsize(sm); + int free_pct = sm->sm_space * 100 / sm->sm_size; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). + */ + if (max_size < metaslab_df_alloc_threshold || + free_pct < metaslab_df_free_pct) { + t = sm->sm_pp_root; + *cursor = 0; + } + + return (metaslab_block_picker(t, cursor, size, 1ULL)); +} + +static boolean_t +metaslab_df_fragmented(space_map_t *sm) +{ + uint64_t max_size = metaslab_pp_maxsize(sm); + int free_pct = sm->sm_space * 100 / sm->sm_size; + + if (max_size >= metaslab_df_alloc_threshold && + free_pct >= metaslab_df_free_pct) + return (B_FALSE); + + return (B_TRUE); +} + +static space_map_ops_t metaslab_df_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_df_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_df_fragmented +}; + +space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; +#endif /* WITH_DF_BLOCK_ALLOCATOR */ + +/* + * ========================================================================== + * Other experimental allocators + * ========================================================================== + */ +#if defined(WITH_CDF_BLOCK_ALLOCATOR) +static uint64_t +metaslab_cdf_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t *cursor = (uint64_t *)sm->sm_ppd; + uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; + uint64_t max_size = metaslab_pp_maxsize(sm); + uint64_t rsize = size; + uint64_t offset = 0; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + ASSERT3U(*extent_end, >=, *cursor); + + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). + */ + if ((*cursor + size) > *extent_end) { + + t = sm->sm_pp_root; + *cursor = *extent_end = 0; + + if (max_size > 2 * SPA_MAXBLOCKSIZE) + rsize = MIN(metaslab_min_alloc_size, max_size); + offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); + if (offset != -1) + *cursor = offset + size; + } else { + offset = metaslab_block_picker(t, cursor, rsize, 1ULL); + } + ASSERT3U(*cursor, <=, *extent_end); + return (offset); +} + +static boolean_t +metaslab_cdf_fragmented(space_map_t *sm) +{ + uint64_t max_size = metaslab_pp_maxsize(sm); + + if (max_size > (metaslab_min_alloc_size * 10)) + return (B_FALSE); + return (B_TRUE); +} + +static space_map_ops_t metaslab_cdf_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_cdf_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_cdf_fragmented +}; + +space_map_ops_t *zfs_metaslab_ops = &metaslab_cdf_ops; +#endif /* WITH_CDF_BLOCK_ALLOCATOR */ + +#if defined(WITH_NDF_BLOCK_ALLOCATOR) +uint64_t metaslab_ndf_clump_shift = 4; + +static uint64_t +metaslab_ndf_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + avl_index_t where; + space_seg_t *ss, ssearch; + uint64_t hbit = highbit(size); + uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1; + uint64_t max_size = metaslab_pp_maxsize(sm); + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + ssearch.ss_start = *cursor; + ssearch.ss_end = *cursor + size; + + ss = avl_find(t, &ssearch, &where); + if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { + t = sm->sm_pp_root; + + ssearch.ss_start = 0; + ssearch.ss_end = MIN(max_size, + 1ULL << (hbit + metaslab_ndf_clump_shift)); + ss = avl_find(t, &ssearch, &where); + if (ss == NULL) + ss = avl_nearest(t, where, AVL_AFTER); + ASSERT(ss != NULL); + } + + if (ss != NULL) { + if (ss->ss_start + size <= ss->ss_end) { + *cursor = ss->ss_start + size; + return (ss->ss_start); + } + } + return (-1ULL); +} + +static boolean_t +metaslab_ndf_fragmented(space_map_t *sm) +{ + uint64_t max_size = metaslab_pp_maxsize(sm); + + if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift)) + return (B_FALSE); + return (B_TRUE); +} + + +static space_map_ops_t metaslab_ndf_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_ndf_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_ndf_fragmented }; +space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; +#endif /* WITH_NDF_BLOCK_ALLOCATOR */ + /* * ========================================================================== * Metaslabs @@ -291,7 +727,7 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, vdev_t *vd = mg->mg_vd; metaslab_t *msp; - msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); + msp = kmem_zalloc(sizeof (metaslab_t), KM_PUSHPAGE); mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); msp->ms_smo_syncing = *smo; @@ -308,6 +744,13 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_group_add(mg, msp); + if (metaslab_debug && smo->smo_object != 0) { + mutex_enter(&msp->ms_lock); + VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, + SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); + mutex_exit(&msp->ms_lock); + } + /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. @@ -318,16 +761,8 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_sync_done(msp, 0); if (txg != 0) { - /* - * The vdev is dirty, but the metaslab isn't -- it just needs - * to have metaslab_sync_done() invoked from vdev_sync_done(). - * [We could just dirty the metaslab, but that would cause us - * to allocate a space map object for it, which is wasteful - * and would mess up the locality logic in metaslab_weight().] - */ - ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa)); vdev_dirty(vd, 0, NULL, txg); - vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg)); + vdev_dirty(vd, VDD_METASLAB, msp, txg); } return (msp); @@ -339,8 +774,8 @@ metaslab_fini(metaslab_t *msp) metaslab_group_t *mg = msp->ms_group; int t; - vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, - -msp->ms_smo.smo_alloc, B_TRUE); + vdev_space_update(mg->mg_vd, + -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); metaslab_group_remove(mg, msp); @@ -354,6 +789,11 @@ metaslab_fini(metaslab_t *msp) space_map_destroy(&msp->ms_freemap[t]); } + for (t = 0; t < TXG_DEFER_SIZE; t++) + space_map_destroy(&msp->ms_defermap[t]); + + ASSERT3S(msp->ms_deferspace, ==, 0); + mutex_exit(&msp->ms_lock); mutex_destroy(&msp->ms_lock); @@ -364,7 +804,6 @@ metaslab_fini(metaslab_t *msp) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) #define METASLAB_ACTIVE_MASK \ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) -#define METASLAB_SMO_BONUS_MULTIPLIER 2 static uint64_t metaslab_weight(metaslab_t *msp) @@ -397,37 +836,91 @@ metaslab_weight(metaslab_t *msp) ASSERT(weight >= space && weight <= 2 * space); /* - * For locality, assign higher weight to metaslabs we've used before. + * For locality, assign higher weight to metaslabs which have + * a lower offset than what we've already activated. */ - if (smo->smo_object != 0) - weight *= METASLAB_SMO_BONUS_MULTIPLIER; + if (sm->sm_start <= mg->mg_bonus_area) + weight *= (metaslab_smo_bonus_pct / 100); ASSERT(weight >= space && - weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space); + weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); + + if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { + /* + * If this metaslab is one we're actively using, adjust its + * weight to make it preferable to any inactive metaslab so + * we'll polish it off. + */ + weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + } + return (weight); +} + +static void +metaslab_prefetch(metaslab_group_t *mg) +{ + spa_t *spa = mg->mg_vd->vdev_spa; + metaslab_t *msp; + avl_tree_t *t = &mg->mg_metaslab_tree; + int m; + + mutex_enter(&mg->mg_lock); /* - * If this metaslab is one we're actively using, adjust its weight to - * make it preferable to any inactive metaslab so we'll polish it off. + * Prefetch the next potential metaslabs */ - weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { + space_map_t *sm = &msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo; - return (weight); + /* If we have reached our prefetch limit then we're done */ + if (m >= metaslab_prefetch_limit) + break; + + if (!sm->sm_loaded && smo->smo_object != 0) { + mutex_exit(&mg->mg_lock); + dmu_prefetch(spa_meta_objset(spa), smo->smo_object, + 0ULL, smo->smo_objsize); + mutex_enter(&mg->mg_lock); + } + } + mutex_exit(&mg->mg_lock); } static int metaslab_activate(metaslab_t *msp, uint64_t activation_weight) { + metaslab_group_t *mg = msp->ms_group; space_map_t *sm = &msp->ms_map; + space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; + int t; ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = space_map_load(sm, &metaslab_ff_ops, - SM_FREE, &msp->ms_smo, - msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); - if (error) { - metaslab_group_sort(msp->ms_group, msp, 0); - return (error); + space_map_load_wait(sm); + if (!sm->sm_loaded) { + int error = space_map_load(sm, sm_ops, SM_FREE, + &msp->ms_smo, + spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); + if (error) { + metaslab_group_sort(msp->ms_group, msp, 0); + return (error); + } + for (t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(&msp->ms_defermap[t], + space_map_claim, sm); + + } + + /* + * Track the bonus area as we activate new metaslabs. + */ + if (sm->sm_start > mg->mg_bonus_area) { + mutex_enter(&mg->mg_lock); + mg->mg_bonus_area = sm->sm_start; + mutex_exit(&mg->mg_lock); } + metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); } @@ -458,7 +951,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; + objset_t *mos = spa_meta_objset(spa); space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; @@ -468,7 +961,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) dmu_tx_t *tx; int t; - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + ASSERT(!vd->vdev_ishole); + + if (allocmap->sm_space == 0 && freemap->sm_space == 0) + return; /* * The only state that can actually be changing concurrently with @@ -478,12 +974,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * We drop it whenever we call into the DMU, because the DMU * can call down to us (e.g. via zio_free()) at any time. */ - mutex_enter(&msp->ms_lock); + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); if (smo->smo_object == 0) { ASSERT(smo->smo_objsize == 0); ASSERT(smo->smo_alloc == 0); - mutex_exit(&msp->ms_lock); smo->smo_object = dmu_object_alloc(mos, DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); @@ -491,9 +987,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * (sm->sm_start >> vd->vdev_ms_shift), sizeof (uint64_t), &smo->smo_object, tx); - mutex_enter(&msp->ms_lock); } + mutex_enter(&msp->ms_lock); + space_map_walk(freemap, space_map_add, freed_map); if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= @@ -506,6 +1003,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * This metaslab is 100% allocated, * minus the content of the in-core map (sm), * minus what's been freed this txg (freed_map), + * minus deferred frees (ms_defermap[]), * minus allocations from txgs in the future * (because they haven't been committed yet). */ @@ -517,6 +1015,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_walk(sm, space_map_remove, allocmap); space_map_walk(freed_map, space_map_remove, allocmap); + for (t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(&msp->ms_defermap[t], + space_map_remove, allocmap); + for (t = 1; t < TXG_CONCURRENT_STATES; t++) space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], space_map_remove, allocmap); @@ -551,10 +1053,14 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) space_map_obj_t *smosync = &msp->ms_smo_syncing; space_map_t *sm = &msp->ms_map; space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; + int64_t alloc_delta, defer_delta; int t; + ASSERT(!vd->vdev_ishole); + mutex_enter(&msp->ms_lock); /* @@ -568,10 +1074,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) space_map_create(&msp->ms_freemap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); } - vdev_space_update(vd, sm->sm_size, 0, B_TRUE); + + for (t = 0; t < TXG_DEFER_SIZE; t++) + space_map_create(&msp->ms_defermap[t], sm->sm_start, + sm->sm_size, sm->sm_shift, sm->sm_lock); + + vdev_space_update(vd, 0, 0, sm->sm_size); } - vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE); + alloc_delta = smosync->smo_alloc - smo->smo_alloc; + defer_delta = freed_map->sm_space - defer_map->sm_space; + + vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); @@ -579,13 +1093,26 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * If there's a space_map_load() in progress, wait for it to complete * so that we have a consistent view of the in-core space map. - * Then, add everything we freed in this txg to the map. + * Then, add defer_map (oldest deferred frees) to this map and + * transfer freed_map (this txg's frees) to defer_map. */ space_map_load_wait(sm); - space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm); + space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); + space_map_vacate(freed_map, space_map_add, defer_map); *smo = *smosync; + msp->ms_deferspace += defer_delta; + ASSERT3S(msp->ms_deferspace, >=, 0); + ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); + if (msp->ms_deferspace != 0) { + /* + * Keep syncing this metaslab until all deferred frees + * are back in circulation. + */ + vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); + } + /* * If the map is loaded but no longer active, evict it as soon as all * future allocations have synced. (If we unloaded it now and then @@ -598,7 +1125,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) evictable = 0; - if (evictable) + if (evictable && !metaslab_debug) space_map_unload(sm); } @@ -607,6 +1134,36 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) mutex_exit(&msp->ms_lock); } +void +metaslab_sync_reassess(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + int64_t failures = mg->mg_alloc_failures; + int m; + + /* + * Re-evaluate all metaslabs which have lower offsets than the + * bonus area. + */ + for (m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_map.sm_start > mg->mg_bonus_area) + break; + + mutex_enter(&msp->ms_lock); + metaslab_group_sort(mg, msp, metaslab_weight(msp)); + mutex_exit(&msp->ms_lock); + } + + atomic_add_64(&mg->mg_alloc_failures, -failures); + + /* + * Prefetch the next potential metaslabs + */ + metaslab_prefetch(mg); +} + static uint64_t metaslab_distance(metaslab_t *msp, dva_t *dva) { @@ -625,9 +1182,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva) } static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, - uint64_t min_distance, dva_t *dva, int d) +metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, + uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) { + spa_t *spa = mg->mg_vd->vdev_spa; metaslab_t *msp = NULL; uint64_t offset = -1ULL; avl_tree_t *t = &mg->mg_metaslab_tree; @@ -636,18 +1194,30 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, int i; activation_weight = METASLAB_WEIGHT_PRIMARY; - for (i = 0; i < d; i++) - if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) + for (i = 0; i < d; i++) { + if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; + break; + } + } for (;;) { + boolean_t was_active; + mutex_enter(&mg->mg_lock); for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { - if (msp->ms_weight < size) { + if (msp->ms_weight < asize) { + spa_dbgmsg(spa, "%s: failed to meet weight " + "requirement: vdev %llu, txg %llu, mg %p, " + "msp %p, psize %llu, asize %llu, " + "failures %llu, weight %llu", + spa_name(spa), mg->mg_vd->vdev_id, txg, + mg, msp, psize, asize, + mg->mg_alloc_failures, msp->ms_weight); mutex_exit(&mg->mg_lock); return (-1ULL); } - + was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; if (activation_weight == METASLAB_WEIGHT_PRIMARY) break; @@ -665,6 +1235,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, if (msp == NULL) return (-1ULL); + /* + * If we've already reached the allowable number of failed + * allocation attempts on this metaslab group then we + * consider skipping it. We skip it only if we're allowed + * to "fast" gang, the physical size is larger than + * a gang block, and we're attempting to allocate from + * the primary metaslab. + */ + if (mg->mg_alloc_failures > zfs_mg_alloc_failures && + CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && + activation_weight == METASLAB_WEIGHT_PRIMARY) { + spa_dbgmsg(spa, "%s: skipping metaslab group: " + "vdev %llu, txg %llu, mg %p, psize %llu, " + "asize %llu, failures %llu", spa_name(spa), + mg->mg_vd->vdev_id, txg, mg, psize, asize, + mg->mg_alloc_failures); + return (-1ULL); + } + mutex_enter(&msp->ms_lock); /* @@ -673,7 +1262,9 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, * another thread may have changed the weight while we * were blocked on the metaslab lock. */ - if (msp->ms_weight < size) { + if (msp->ms_weight < asize || (was_active && + !(msp->ms_weight & METASLAB_ACTIVE_MASK) && + activation_weight == METASLAB_WEIGHT_PRIMARY)) { mutex_exit(&msp->ms_lock); continue; } @@ -691,10 +1282,12 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, continue; } - if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) + if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) break; - metaslab_passivate(msp, size - 1); + atomic_inc_64(&mg->mg_alloc_failures); + + metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); mutex_exit(&msp->ms_lock); } @@ -702,7 +1295,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); mutex_exit(&msp->ms_lock); @@ -716,7 +1309,7 @@ static int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) { - metaslab_group_t *mg, *rotor; + metaslab_group_t *mg, *fast_mg, *rotor; vdev_t *vd; int dshift = 3; int all_zero; @@ -731,12 +1324,15 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, /* * For testing, make some blocks above a certain size be gang blocks. */ - if (psize >= metaslab_gang_bang && (lbolt & 3) == 0) + if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) return (ENOSPC); + if (flags & METASLAB_FASTWRITE) + mutex_enter(&mc->mc_fastwrite_lock); + /* * Start at the rotor and loop through all mgs until we find something. - * Note that there's no locking on mc_rotor or mc_allocated because + * Note that there's no locking on mc_rotor or mc_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * @@ -758,27 +1354,50 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, */ if (hintdva) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); - if (flags & METASLAB_HINTBP_AVOID) - mg = vd->vdev_mg->mg_next; - else + + /* + * It's possible the vdev we're using as the hint no + * longer exists (i.e. removed). Consult the rotor when + * all else fails. + */ + if (vd != NULL) { mg = vd->vdev_mg; + + if (flags & METASLAB_HINTBP_AVOID && + mg->mg_next != NULL) + mg = mg->mg_next; + } else { + mg = mc->mc_rotor; + } } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; + } else if (flags & METASLAB_FASTWRITE) { + mg = fast_mg = mc->mc_rotor; + + do { + if (fast_mg->mg_vd->vdev_pending_fastwrite < + mg->mg_vd->vdev_pending_fastwrite) + mg = fast_mg; + } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); + } else { mg = mc->mc_rotor; } /* - * If the hint put us into the wrong class, just follow the rotor. + * If the hint put us into the wrong metaslab class, or into a + * metaslab group that has been passivated, just follow the rotor. */ - if (mg->mg_class != mc) + if (mg->mg_class != mc || mg->mg_activation_count <= 0) mg = mc->mc_rotor; rotor = mg; top: all_zero = B_TRUE; do { + ASSERT(mg->mg_activation_count == 1); + vd = mg->mg_vd; /* @@ -815,7 +1434,8 @@ top: asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); + offset = metaslab_group_alloc(mg, psize, asize, txg, distance, + dva, d, flags); if (offset != -1ULL) { /* * If we've just selected this metaslab group, @@ -823,32 +1443,35 @@ top: * over- or under-used relative to the pool, * and set an allocation bias to even it out. */ - if (mc->mc_allocated == 0) { + if (mc->mc_aliquot == 0) { vdev_stat_t *vs = &vd->vdev_stat; - uint64_t alloc, space; - int64_t vu, su; - - alloc = spa_get_alloc(spa); - space = spa_get_space(spa); + int64_t vu, cu; - /* - * Determine percent used in units of 0..1024. - * (This is just to avoid floating point.) - */ - vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); - su = (alloc << 10) / (space + 1); + vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); + cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); /* - * Bias by at most +/- 25% of the aliquot. + * Calculate how much more or less we should + * try to allocate from this device during + * this iteration around the rotor. + * For example, if a device is 80% full + * and the pool is 20% full then we should + * reduce allocations by 60% on this device. + * + * mg_bias = (20 - 80) * 512K / 100 = -307K + * + * This reduces allocations by 307K for this + * iteration. */ - mg->mg_bias = ((su - vu) * - (int64_t)mg->mg_aliquot) / (1024 * 4); + mg->mg_bias = ((cu - vu) * + (int64_t)mg->mg_aliquot) / 100; } - if (atomic_add_64_nv(&mc->mc_allocated, asize) >= + if ((flags & METASLAB_FASTWRITE) || + atomic_add_64_nv(&mc->mc_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mc->mc_rotor = mg->mg_next; - mc->mc_allocated = 0; + mc->mc_aliquot = 0; } DVA_SET_VDEV(&dva[d], vd->vdev_id); @@ -856,11 +1479,17 @@ top: DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); DVA_SET_ASIZE(&dva[d], asize); + if (flags & METASLAB_FASTWRITE) { + atomic_add_64(&vd->vdev_pending_fastwrite, + psize); + mutex_exit(&mc->mc_fastwrite_lock); + } + return (0); } next: mc->mc_rotor = mg->mg_next; - mc->mc_allocated = 0; + mc->mc_aliquot = 0; } while ((mg = mg->mg_next) != rotor); if (!all_zero) { @@ -869,7 +1498,7 @@ next: goto top; } - if (!zio_lock) { + if (!allocatable && !zio_lock) { dshift = 3; zio_lock = B_TRUE; goto top; @@ -877,6 +1506,8 @@ next: bzero(&dva[d], sizeof (dva_t)); + if (flags & METASLAB_FASTWRITE) + mutex_exit(&mc->mc_fastwrite_lock); return (ENOSPC); } @@ -940,7 +1571,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd; metaslab_t *msp; - int error; + int error = 0; ASSERT(DVA_IS_VALID(dva)); @@ -955,7 +1586,12 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + + if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) + error = ENOENT; + if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); @@ -980,9 +1616,10 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; - int error = 0; + int d, error = 0; ASSERT(bp->blk_birth == 0); + ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); @@ -995,7 +1632,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); - for (int d = 0; d < ndvas; d++) { + for (d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, txg, flags); if (error) { @@ -1012,7 +1649,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, spa_config_exit(spa, SCL_ALLOC, FTAG); - bp->blk_birth = txg; + BP_SET_BIRTH(bp, txg, txg); return (0); } @@ -1021,14 +1658,14 @@ void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); + int d, ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg); + ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); - for (int d = 0; d < ndvas; d++) + for (d = 0; d < ndvas; d++) metaslab_free_dva(spa, &dva[d], txg, now); spa_config_exit(spa, SCL_FREE, FTAG); @@ -1039,7 +1676,7 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); - int error = 0; + int d, error = 0; ASSERT(!BP_IS_HOLE(bp)); @@ -1054,7 +1691,7 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - for (int d = 0; d < ndvas; d++) + for (d = 0; d < ndvas; d++) if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) break; @@ -1064,3 +1701,48 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) return (error); } + +void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + uint64_t psize = BP_GET_PSIZE(bp); + int d; + vdev_t *vd; + + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(psize > 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (d = 0; d < ndvas; d++) { + if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) + continue; + atomic_add_64(&vd->vdev_pending_fastwrite, psize); + } + + spa_config_exit(spa, SCL_VDEV, FTAG); +} + +void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + uint64_t psize = BP_GET_PSIZE(bp); + int d; + vdev_t *vd; + + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(psize > 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (d = 0; d < ndvas; d++) { + if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) + continue; + ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); + atomic_sub_64(&vd->vdev_pending_fastwrite, psize); + } + + spa_config_exit(spa, SCL_VDEV, FTAG); +}