X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fmetaslab.c;h=cd1b6ce730f4843bbe5d41f9f4b24f39f4aa48f5;hb=refs%2Fheads%2Frertzinger%2Ffeature-zpool-get--p;hp=77556ac5d71cd718ef918343a08851c4cdf85495;hpb=9babb37438b58e77bad04e820d5702e15b79e6a6;p=zfs.git diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 77556ac..cd1b6ce 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include -#include #include #include #include @@ -32,12 +31,46 @@ #include #include +#define WITH_DF_BLOCK_ALLOCATOR + +/* + * Allow allocations to switch to gang blocks quickly. We do this to + * avoid having to load lots of space_maps in a given txg. There are, + * however, some cases where we want to avoid "fast" ganging and instead + * we want to do an exhaustive search of all metaslabs on this device. + * Currently we don't allow any gang, zil, or dump device related allocations + * to "fast" gang. + */ +#define CAN_FASTGANG(flags) \ + (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ + METASLAB_GANG_AVOID))) + uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* + * The in-core space map representation is more compact than its on-disk form. + * The zfs_condense_pct determines how much more compact the in-core + * space_map representation must be before we compact it on-disk. + * Values should be greater than or equal to 100. + */ +int zfs_condense_pct = 200; + +/* + * This value defines the number of allowed allocation failures per vdev. + * If a device reaches this threshold in a given txg then we consider skipping + * allocations on that device. + */ +int zfs_mg_alloc_failures; + +/* + * Metaslab debugging: when set, keeps all space maps in core to verify frees. + */ +int metaslab_debug = 0; + +/* * Minimum size which forces the dynamic allocator to change - * it's allocation strategy. Once the space map cannot satisfy + * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ @@ -49,7 +82,23 @@ uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; * Once the space_map's free space drops below this level we dynamically * switch to using best-fit allocations. */ -int metaslab_df_free_pct = 30; +int metaslab_df_free_pct = 4; + +/* + * A metaslab is considered "free" if it contains a contiguous + * segment which is greater than metaslab_min_alloc_size. + */ +uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; + +/* + * Max number of space_maps to prefetch. + */ +int metaslab_prefetch_limit = SPA_DVAS_PER_BP; + +/* + * Percentage bonus multiplier for metaslabs that are in the bonus area. 
+ */ +int metaslab_smo_bonus_pct = 150; /* * ========================================================================== @@ -57,14 +106,16 @@ int metaslab_df_free_pct = 30; * ========================================================================== */ metaslab_class_t * -metaslab_class_create(space_map_ops_t *ops) +metaslab_class_create(spa_t *spa, space_map_ops_t *ops) { metaslab_class_t *mc; - mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); + mc = kmem_zalloc(sizeof (metaslab_class_t), KM_PUSHPAGE); + mc->mc_spa = spa; mc->mc_rotor = NULL; mc->mc_ops = ops; + mutex_init(&mc->mc_fastwrite_lock, NULL, MUTEX_DEFAULT, NULL); return (mc); } @@ -72,58 +123,74 @@ metaslab_class_create(space_map_ops_t *ops) void metaslab_class_destroy(metaslab_class_t *mc) { - metaslab_group_t *mg; - - while ((mg = mc->mc_rotor) != NULL) { - metaslab_class_remove(mc, mg); - metaslab_group_destroy(mg); - } + ASSERT(mc->mc_rotor == NULL); + ASSERT(mc->mc_alloc == 0); + ASSERT(mc->mc_deferred == 0); + ASSERT(mc->mc_space == 0); + ASSERT(mc->mc_dspace == 0); + mutex_destroy(&mc->mc_fastwrite_lock); kmem_free(mc, sizeof (metaslab_class_t)); } -void -metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg) +int +metaslab_class_validate(metaslab_class_t *mc) { - metaslab_group_t *mgprev, *mgnext; + metaslab_group_t *mg; + vdev_t *vd; - ASSERT(mg->mg_class == NULL); + /* + * Must hold one of the spa_config locks. + */ + ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || + spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); - if ((mgprev = mc->mc_rotor) == NULL) { - mg->mg_prev = mg; - mg->mg_next = mg; - } else { - mgnext = mgprev->mg_next; - mg->mg_prev = mgprev; - mg->mg_next = mgnext; - mgprev->mg_next = mg; - mgnext->mg_prev = mg; - } - mc->mc_rotor = mg; - mg->mg_class = mc; + if ((mg = mc->mc_rotor) == NULL) + return (0); + + do { + vd = mg->mg_vd; + ASSERT(vd->vdev_mg != NULL); + ASSERT3P(vd->vdev_top, ==, vd); + ASSERT3P(mg->mg_class, ==, mc); + ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); + } while ((mg = mg->mg_next) != mc->mc_rotor); + + return (0); } void -metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg) +metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, + int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { - metaslab_group_t *mgprev, *mgnext; + atomic_add_64(&mc->mc_alloc, alloc_delta); + atomic_add_64(&mc->mc_deferred, defer_delta); + atomic_add_64(&mc->mc_space, space_delta); + atomic_add_64(&mc->mc_dspace, dspace_delta); +} - ASSERT(mg->mg_class == mc); +uint64_t +metaslab_class_get_alloc(metaslab_class_t *mc) +{ + return (mc->mc_alloc); +} - mgprev = mg->mg_prev; - mgnext = mg->mg_next; +uint64_t +metaslab_class_get_deferred(metaslab_class_t *mc) +{ + return (mc->mc_deferred); +} - if (mg == mgnext) { - mc->mc_rotor = NULL; - } else { - mc->mc_rotor = mgnext; - mgprev->mg_next = mgnext; - mgnext->mg_prev = mgprev; - } +uint64_t +metaslab_class_get_space(metaslab_class_t *mc) +{ + return (mc->mc_space); +} - mg->mg_prev = NULL; - mg->mg_next = NULL; - mg->mg_class = NULL; +uint64_t +metaslab_class_get_dspace(metaslab_class_t *mc) +{ + return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); } /* @@ -145,9 +212,9 @@ metaslab_compare(const void *x1, const void *x2) /* * If the weights are identical, use the offset to force uniqueness. 
*/ - if (m1->ms_map.sm_start < m2->ms_map.sm_start) + if (m1->ms_map->sm_start < m2->ms_map->sm_start) return (-1); - if (m1->ms_map.sm_start > m2->ms_map.sm_start) + if (m1->ms_map->sm_start > m2->ms_map->sm_start) return (1); ASSERT3P(m1, ==, m2); @@ -160,13 +227,13 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) { metaslab_group_t *mg; - mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); + mg = kmem_zalloc(sizeof (metaslab_group_t), KM_PUSHPAGE); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); - mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children); mg->mg_vd = vd; - metaslab_class_add(mc, mg); + mg->mg_class = mc; + mg->mg_activation_count = 0; return (mg); } @@ -174,11 +241,82 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) void metaslab_group_destroy(metaslab_group_t *mg) { + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + /* + * We may have gone below zero with the activation count + * either because we never activated in the first place or + * because we're done, and possibly removing the vdev. + */ + ASSERT(mg->mg_activation_count <= 0); + avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); kmem_free(mg, sizeof (metaslab_group_t)); } +void +metaslab_group_activate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + metaslab_group_t *mgprev, *mgnext; + + ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count <= 0); + + if (++mg->mg_activation_count <= 0) + return; + + mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + + if ((mgprev = mc->mc_rotor) == NULL) { + mg->mg_prev = mg; + mg->mg_next = mg; + } else { + mgnext = mgprev->mg_next; + mg->mg_prev = mgprev; + mg->mg_next = mgnext; + mgprev->mg_next = mg; + mgnext->mg_prev = mg; + } + mc->mc_rotor = mg; +} + +void +metaslab_group_passivate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + metaslab_group_t *mgprev, *mgnext; + + ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + + if (--mg->mg_activation_count != 0) { + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count < 0); + return; + } + + mgprev = mg->mg_prev; + mgnext = mg->mg_next; + + if (mg == mgnext) { + mc->mc_rotor = NULL; + } else { + mc->mc_rotor = mgnext; + mgprev->mg_next = mgnext; + mgnext->mg_prev = mgprev; + } + + mg->mg_prev = NULL; + mg->mg_next = NULL; +} + static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { @@ -219,6 +357,35 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) } /* + * ========================================================================== + * Common allocator routines + * ========================================================================== + */ +static int +metaslab_segsize_compare(const void *x1, const void *x2) +{ + const space_seg_t *s1 = x1; + const space_seg_t *s2 = x2; + uint64_t ss_size1 = s1->ss_end - s1->ss_start; + uint64_t ss_size2 = s2->ss_end - s2->ss_start; + + if (ss_size1 < ss_size2) + return (-1); + if (ss_size1 > ss_size2) + return (1); + + if (s1->ss_start < s2->ss_start) + return (-1); + if (s1->ss_start > s2->ss_start) + return (1); + + return (0); +} + +#if defined(WITH_FF_BLOCK_ALLOCATOR) || \ + defined(WITH_DF_BLOCK_ALLOCATOR) 
|| \ + defined(WITH_CDF_BLOCK_ALLOCATOR) +/* * This is a helper function that can be used by the allocator to find * a suitable block to allocate. This will search the specified AVL * tree looking for a block that matches the specified criteria. @@ -257,141 +424,249 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, *cursor = 0; return (metaslab_block_picker(t, cursor, size, align)); } +#endif /* WITH_FF/DF/CDF_BLOCK_ALLOCATOR */ -/* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== - */ static void -metaslab_ff_load(space_map_t *sm) +metaslab_pp_load(space_map_t *sm) { + space_seg_t *ss; + ASSERT(sm->sm_ppd == NULL); - sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); - sm->sm_pp_root = NULL; + sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_PUSHPAGE); + + sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_PUSHPAGE); + avl_create(sm->sm_pp_root, metaslab_segsize_compare, + sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + avl_add(sm->sm_pp_root, ss); } static void -metaslab_ff_unload(space_map_t *sm) +metaslab_pp_unload(space_map_t *sm) { + void *cookie = NULL; + kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); sm->sm_ppd = NULL; -} -static uint64_t -metaslab_ff_alloc(space_map_t *sm, uint64_t size) -{ - avl_tree_t *t = &sm->sm_root; - uint64_t align = size & -size; - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { + /* tear down the tree */ + } - return (metaslab_block_picker(t, cursor, size, align)); + avl_destroy(sm->sm_pp_root); + kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); + sm->sm_pp_root = NULL; } /* ARGSUSED */ static void -metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size) +metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) { /* No need to update cursor */ } /* ARGSUSED */ static void -metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size) +metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) { /* No need to update cursor */ } +/* + * Return the maximum contiguous segment within the metaslab. 
+ */ +uint64_t +metaslab_pp_maxsize(space_map_t *sm) +{ + avl_tree_t *t = sm->sm_pp_root; + space_seg_t *ss; + + if (t == NULL || (ss = avl_last(t)) == NULL) + return (0ULL); + + return (ss->ss_end - ss->ss_start); +} + +#if defined(WITH_FF_BLOCK_ALLOCATOR) +/* + * ========================================================================== + * The first-fit block allocator + * ========================================================================== + */ +static uint64_t +metaslab_ff_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + + return (metaslab_block_picker(t, cursor, size, align)); +} + +/* ARGSUSED */ +boolean_t +metaslab_ff_fragmented(space_map_t *sm) +{ + return (B_TRUE); +} + static space_map_ops_t metaslab_ff_ops = { - metaslab_ff_load, - metaslab_ff_unload, + metaslab_pp_load, + metaslab_pp_unload, metaslab_ff_alloc, - metaslab_ff_claim, - metaslab_ff_free, - NULL /* maxsize */ + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_ff_fragmented }; +space_map_ops_t *zfs_metaslab_ops = &metaslab_ff_ops; +#endif /* WITH_FF_BLOCK_ALLOCATOR */ + +#if defined(WITH_DF_BLOCK_ALLOCATOR) /* + * ========================================================================== * Dynamic block allocator - * Uses the first fit allocation scheme until space get low and then * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold * and metaslab_df_free_pct to determine when to switch the allocation scheme. + * ========================================================================== */ - -uint64_t -metaslab_df_maxsize(space_map_t *sm) +static uint64_t +metaslab_df_alloc(space_map_t *sm, uint64_t size) { - avl_tree_t *t = sm->sm_pp_root; - space_seg_t *ss; + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + uint64_t max_size = metaslab_pp_maxsize(sm); + int free_pct = sm->sm_space * 100 / sm->sm_size; - if (t == NULL || (ss = avl_last(t)) == NULL) - return (0ULL); + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); - return (ss->ss_end - ss->ss_start); + if (max_size < size) + return (-1ULL); + + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). 
+ */ + if (max_size < metaslab_df_alloc_threshold || + free_pct < metaslab_df_free_pct) { + t = sm->sm_pp_root; + *cursor = 0; + } + + return (metaslab_block_picker(t, cursor, size, 1ULL)); } -static int -metaslab_df_seg_compare(const void *x1, const void *x2) +static boolean_t +metaslab_df_fragmented(space_map_t *sm) { - const space_seg_t *s1 = x1; - const space_seg_t *s2 = x2; - uint64_t ss_size1 = s1->ss_end - s1->ss_start; - uint64_t ss_size2 = s2->ss_end - s2->ss_start; - - if (ss_size1 < ss_size2) - return (-1); - if (ss_size1 > ss_size2) - return (1); + uint64_t max_size = metaslab_pp_maxsize(sm); + int free_pct = sm->sm_space * 100 / sm->sm_size; - if (s1->ss_start < s2->ss_start) - return (-1); - if (s1->ss_start > s2->ss_start) - return (1); + if (max_size >= metaslab_df_alloc_threshold && + free_pct >= metaslab_df_free_pct) + return (B_FALSE); - return (0); + return (B_TRUE); } -static void -metaslab_df_load(space_map_t *sm) +static space_map_ops_t metaslab_df_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_df_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_df_fragmented +}; + +space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; +#endif /* WITH_DF_BLOCK_ALLOCATOR */ + +/* + * ========================================================================== + * Other experimental allocators + * ========================================================================== + */ +#if defined(WITH_CDF_BLOCK_ALLOCATOR) +static uint64_t +metaslab_cdf_alloc(space_map_t *sm, uint64_t size) { - space_seg_t *ss; + avl_tree_t *t = &sm->sm_root; + uint64_t *cursor = (uint64_t *)sm->sm_ppd; + uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; + uint64_t max_size = metaslab_pp_maxsize(sm); + uint64_t rsize = size; + uint64_t offset = 0; - ASSERT(sm->sm_ppd == NULL); - sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); - sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); - avl_create(sm->sm_pp_root, metaslab_df_seg_compare, - sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); + if (max_size < size) + return (-1ULL); - for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) - avl_add(sm->sm_pp_root, ss); + ASSERT3U(*extent_end, >=, *cursor); + + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). 
+ */ + if ((*cursor + size) > *extent_end) { + + t = sm->sm_pp_root; + *cursor = *extent_end = 0; + + if (max_size > 2 * SPA_MAXBLOCKSIZE) + rsize = MIN(metaslab_min_alloc_size, max_size); + offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); + if (offset != -1) + *cursor = offset + size; + } else { + offset = metaslab_block_picker(t, cursor, rsize, 1ULL); + } + ASSERT3U(*cursor, <=, *extent_end); + return (offset); } -static void -metaslab_df_unload(space_map_t *sm) +static boolean_t +metaslab_cdf_fragmented(space_map_t *sm) { - void *cookie = NULL; + uint64_t max_size = metaslab_pp_maxsize(sm); - kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); - sm->sm_ppd = NULL; + if (max_size > (metaslab_min_alloc_size * 10)) + return (B_FALSE); + return (B_TRUE); +} - while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { - /* tear down the tree */ - } +static space_map_ops_t metaslab_cdf_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_cdf_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_cdf_fragmented +}; - avl_destroy(sm->sm_pp_root); - kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); - sm->sm_pp_root = NULL; -} +space_map_ops_t *zfs_metaslab_ops = &metaslab_cdf_ops; +#endif /* WITH_CDF_BLOCK_ALLOCATOR */ + +#if defined(WITH_NDF_BLOCK_ALLOCATOR) +uint64_t metaslab_ndf_clump_shift = 4; static uint64_t -metaslab_df_alloc(space_map_t *sm, uint64_t size) +metaslab_ndf_alloc(space_map_t *sm, uint64_t size) { avl_tree_t *t = &sm->sm_root; - uint64_t align = size & -size; - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; - uint64_t max_size = metaslab_df_maxsize(sm); - int free_pct = sm->sm_space * 100 / sm->sm_size; + avl_index_t where; + space_seg_t *ss, ssearch; + uint64_t hbit = highbit(size); + uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1; + uint64_t max_size = metaslab_pp_maxsize(sm); ASSERT(MUTEX_HELD(sm->sm_lock)); ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); @@ -399,43 +674,54 @@ metaslab_df_alloc(space_map_t *sm, uint64_t size) if (max_size < size) return (-1ULL); - /* - * If we're running low on space switch to using the size - * sorted AVL tree (best-fit). 
- */ - if (max_size < metaslab_df_alloc_threshold || - free_pct < metaslab_df_free_pct) { + ssearch.ss_start = *cursor; + ssearch.ss_end = *cursor + size; + + ss = avl_find(t, &ssearch, &where); + if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { t = sm->sm_pp_root; - *cursor = 0; + + ssearch.ss_start = 0; + ssearch.ss_end = MIN(max_size, + 1ULL << (hbit + metaslab_ndf_clump_shift)); + ss = avl_find(t, &ssearch, &where); + if (ss == NULL) + ss = avl_nearest(t, where, AVL_AFTER); + ASSERT(ss != NULL); } - return (metaslab_block_picker(t, cursor, size, 1ULL)); + if (ss != NULL) { + if (ss->ss_start + size <= ss->ss_end) { + *cursor = ss->ss_start + size; + return (ss->ss_start); + } + } + return (-1ULL); } -/* ARGSUSED */ -static void -metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size) +static boolean_t +metaslab_ndf_fragmented(space_map_t *sm) { - /* No need to update cursor */ -} + uint64_t max_size = metaslab_pp_maxsize(sm); -/* ARGSUSED */ -static void -metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size) -{ - /* No need to update cursor */ + if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift)) + return (B_FALSE); + return (B_TRUE); } -static space_map_ops_t metaslab_df_ops = { - metaslab_df_load, - metaslab_df_unload, - metaslab_df_alloc, - metaslab_df_claim, - metaslab_df_free, - metaslab_df_maxsize + +static space_map_ops_t metaslab_ndf_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_ndf_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_ndf_fragmented }; -space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; +space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; +#endif /* WITH_NDF_BLOCK_ALLOCATOR */ /* * ========================================================================== @@ -449,7 +735,7 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, vdev_t *vd = mg->mg_vd; metaslab_t *msp; - msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); + msp = kmem_zalloc(sizeof (metaslab_t), KM_PUSHPAGE); mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); msp->ms_smo_syncing = *smo; @@ -461,11 +747,19 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, * addition of new space; and for debugging, it ensures that we'd * data fault on any attempt to use this metaslab before it's ready. */ - space_map_create(&msp->ms_map, start, size, + msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_PUSHPAGE); + space_map_create(msp->ms_map, start, size, vd->vdev_ashift, &msp->ms_lock); metaslab_group_add(mg, msp); + if (metaslab_debug && smo->smo_object != 0) { + mutex_enter(&msp->ms_lock); + VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops, + SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); + mutex_exit(&msp->ms_lock); + } + /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. @@ -476,16 +770,8 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_sync_done(msp, 0); if (txg != 0) { - /* - * The vdev is dirty, but the metaslab isn't -- it just needs - * to have metaslab_sync_done() invoked from vdev_sync_done(). - * [We could just dirty the metaslab, but that would cause us - * to allocate a space map object for it, which is wasteful - * and would mess up the locality logic in metaslab_weight().] 
- */ - ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa)); vdev_dirty(vd, 0, NULL, txg); - vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg)); + vdev_dirty(vd, VDD_METASLAB, msp, txg); } return (msp); @@ -497,21 +783,31 @@ metaslab_fini(metaslab_t *msp) metaslab_group_t *mg = msp->ms_group; int t; - vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, - -msp->ms_smo.smo_alloc, B_TRUE); + vdev_space_update(mg->mg_vd, + -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size); metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); - space_map_unload(&msp->ms_map); - space_map_destroy(&msp->ms_map); + space_map_unload(msp->ms_map); + space_map_destroy(msp->ms_map); + kmem_free(msp->ms_map, sizeof (*msp->ms_map)); for (t = 0; t < TXG_SIZE; t++) { - space_map_destroy(&msp->ms_allocmap[t]); - space_map_destroy(&msp->ms_freemap[t]); + space_map_destroy(msp->ms_allocmap[t]); + space_map_destroy(msp->ms_freemap[t]); + kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t])); + kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t])); + } + + for (t = 0; t < TXG_DEFER_SIZE; t++) { + space_map_destroy(msp->ms_defermap[t]); + kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t])); } + ASSERT0(msp->ms_deferspace); + mutex_exit(&msp->ms_lock); mutex_destroy(&msp->ms_lock); @@ -522,13 +818,12 @@ metaslab_fini(metaslab_t *msp) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) #define METASLAB_ACTIVE_MASK \ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) -#define METASLAB_SMO_BONUS_MULTIPLIER 2 static uint64_t metaslab_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; - space_map_t *sm = &msp->ms_map; + space_map_t *sm = msp->ms_map; space_map_obj_t *smo = &msp->ms_smo; vdev_t *vd = mg->mg_vd; uint64_t weight, space; @@ -555,44 +850,91 @@ metaslab_weight(metaslab_t *msp) ASSERT(weight >= space && weight <= 2 * space); /* - * For locality, assign higher weight to metaslabs we've used before. + * For locality, assign higher weight to metaslabs which have + * a lower offset than what we've already activated. */ - if (smo->smo_object != 0) - weight *= METASLAB_SMO_BONUS_MULTIPLIER; + if (sm->sm_start <= mg->mg_bonus_area) + weight *= (metaslab_smo_bonus_pct / 100); ASSERT(weight >= space && - weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space); + weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); + + if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { + /* + * If this metaslab is one we're actively using, adjust its + * weight to make it preferable to any inactive metaslab so + * we'll polish it off. + */ + weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + } + return (weight); +} + +static void +metaslab_prefetch(metaslab_group_t *mg) +{ + spa_t *spa = mg->mg_vd->vdev_spa; + metaslab_t *msp; + avl_tree_t *t = &mg->mg_metaslab_tree; + int m; + + mutex_enter(&mg->mg_lock); /* - * If this metaslab is one we're actively using, adjust its weight to - * make it preferable to any inactive metaslab so we'll polish it off. 
+ * Prefetch the next potential metaslabs */ - weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { + space_map_t *sm = msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo; - return (weight); + /* If we have reached our prefetch limit then we're done */ + if (m >= metaslab_prefetch_limit) + break; + + if (!sm->sm_loaded && smo->smo_object != 0) { + mutex_exit(&mg->mg_lock); + dmu_prefetch(spa_meta_objset(spa), smo->smo_object, + 0ULL, smo->smo_objsize); + mutex_enter(&mg->mg_lock); + } + } + mutex_exit(&mg->mg_lock); } static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) +metaslab_activate(metaslab_t *msp, uint64_t activation_weight) { - space_map_t *sm = &msp->ms_map; + metaslab_group_t *mg = msp->ms_group; + space_map_t *sm = msp->ms_map; space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; + int t; ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo, - msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); - if (error) { - metaslab_group_sort(msp->ms_group, msp, 0); - return (error); + space_map_load_wait(sm); + if (!sm->sm_loaded) { + space_map_obj_t *smo = &msp->ms_smo; + + int error = space_map_load(sm, sm_ops, SM_FREE, smo, + spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); + if (error) { + metaslab_group_sort(msp->ms_group, msp, 0); + return (error); + } + for (t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(msp->ms_defermap[t], + space_map_claim, sm); + } /* - * If we were able to load the map then make sure - * that this map is still able to satisfy our request. + * Track the bonus area as we activate new metaslabs. */ - if (msp->ms_weight < size) - return (ENOSPC); + if (sm->sm_start > mg->mg_bonus_area) { + mutex_enter(&mg->mg_lock); + mg->mg_bonus_area = sm->sm_start; + mutex_exit(&mg->mg_lock); + } metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); @@ -611,12 +953,159 @@ metaslab_passivate(metaslab_t *msp, uint64_t size) * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. */ - ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); + ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0); metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); } /* + * Determine if the in-core space map representation can be condensed on-disk. + * We would like to use the following criteria to make our decision: + * + * 1. The size of the space map object should not dramatically increase as a + * result of writing out our in-core free map. + * + * 2. The minimal on-disk space map representation is zfs_condense_pct/100 + * times the size than the in-core representation (i.e. zfs_condense_pct = 110 + * and in-core = 1MB, minimal = 1.1.MB). + * + * Checking the first condition is tricky since we don't want to walk + * the entire AVL tree calculating the estimated on-disk size. Instead we + * use the size-ordered AVL tree in the space map and calculate the + * size required for the largest segment in our in-core free map. If the + * size required to represent that segment on disk is larger than the space + * map object then we avoid condensing this map. + * + * To determine the second criterion we use a best-case estimate and assume + * each segment can be represented on-disk as a single 64-bit entry. 
We refer + * to this best-case estimate as the space map's minimal form. + */ +static boolean_t +metaslab_should_condense(metaslab_t *msp) +{ + space_map_t *sm = msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo_syncing; + space_seg_t *ss; + uint64_t size, entries, segsz; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(sm->sm_loaded); + + /* + * Use the sm_pp_root AVL tree, which is ordered by size, to obtain + * the largest segment in the in-core free map. If the tree is + * empty then we should condense the map. + */ + ss = avl_last(sm->sm_pp_root); + if (ss == NULL) + return (B_TRUE); + + /* + * Calculate the number of 64-bit entries this segment would + * require when written to disk. If this single segment would be + * larger on-disk than the entire current on-disk structure, then + * clearly condensing will increase the on-disk structure size. + */ + size = (ss->ss_end - ss->ss_start) >> sm->sm_shift; + entries = size / (MIN(size, SM_RUN_MAX)); + segsz = entries * sizeof (uint64_t); + + return (segsz <= smo->smo_objsize && + smo->smo_objsize >= (zfs_condense_pct * + sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100); +} + +/* + * Condense the on-disk space map representation to its minimized form. + * The minimized form consists of a small number of allocations followed by + * the in-core free map. + */ +static void +metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK]; + space_map_t condense_map; + space_map_t *sm = msp->ms_map; + objset_t *mos = spa_meta_objset(spa); + space_map_obj_t *smo = &msp->ms_smo_syncing; + int t; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(sm->sm_loaded); + + spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, " + "smo size %llu, segments %lu", txg, + (msp->ms_map->sm_start / msp->ms_map->sm_size), msp, + smo->smo_objsize, avl_numnodes(&sm->sm_root)); + + /* + * Create an map that is a 100% allocated map. We remove segments + * that have been freed in this txg, any deferred frees that exist, + * and any allocation in the future. Removing segments should be + * a relatively inexpensive operation since we expect these maps to + * a small number of nodes. + */ + space_map_create(&condense_map, sm->sm_start, sm->sm_size, + sm->sm_shift, sm->sm_lock); + space_map_add(&condense_map, condense_map.sm_start, + condense_map.sm_size); + + /* + * Remove what's been freed in this txg from the condense_map. + * Since we're in sync_pass 1, we know that all the frees from + * this txg are in the freemap. + */ + space_map_walk(freemap, space_map_remove, &condense_map); + + for (t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(msp->ms_defermap[t], + space_map_remove, &condense_map); + + for (t = 1; t < TXG_CONCURRENT_STATES; t++) + space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK], + space_map_remove, &condense_map); + + /* + * We're about to drop the metaslab's lock thus allowing + * other consumers to change it's content. Set the + * space_map's sm_condensing flag to ensure that + * allocations on this metaslab do not occur while we're + * in the middle of committing it to disk. This is only critical + * for the ms_map as all other space_maps use per txg + * views of their content. 
+ */ + sm->sm_condensing = B_TRUE; + + mutex_exit(&msp->ms_lock); + space_map_truncate(smo, mos, tx); + mutex_enter(&msp->ms_lock); + + /* + * While we would ideally like to create a space_map representation + * that consists only of allocation records, doing so can be + * prohibitively expensive because the in-core free map can be + * large, and therefore computationally expensive to subtract + * from the condense_map. Instead we sync out two maps, a cheap + * allocation only map followed by the in-core free map. While not + * optimal, this is typically close to optimal, and much cheaper to + * compute. + */ + space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx); + space_map_vacate(&condense_map, NULL, NULL); + space_map_destroy(&condense_map); + + space_map_sync(sm, SM_FREE, smo, mos, tx); + sm->sm_condensing = B_FALSE; + + spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, " + "smo size %llu", txg, + (msp->ms_map->sm_start / msp->ms_map->sm_size), msp, + smo->smo_objsize); +} + +/* * Write a metaslab to disk in the context of the specified transaction group. */ void @@ -624,17 +1113,31 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; - space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; - space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - space_map_t *sm = &msp->ms_map; + objset_t *mos = spa_meta_objset(spa); + space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK]; + space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK]; + space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *sm = msp->ms_map; space_map_obj_t *smo = &msp->ms_smo_syncing; dmu_buf_t *db; dmu_tx_t *tx; - int t; - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + ASSERT(!vd->vdev_ishole); + + /* + * This metaslab has just been added so there's no work to do now. + */ + if (*freemap == NULL) { + ASSERT3P(allocmap, ==, NULL); + return; + } + + ASSERT3P(allocmap, !=, NULL); + ASSERT3P(*freemap, !=, NULL); + ASSERT3P(*freed_map, !=, NULL); + + if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0) + return; /* * The only state that can actually be changing concurrently with @@ -644,12 +1147,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * We drop it whenever we call into the DMU, because the DMU * can call down to us (e.g. via zio_free()) at any time. */ - mutex_enter(&msp->ms_lock); + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); if (smo->smo_object == 0) { ASSERT(smo->smo_objsize == 0); ASSERT(smo->smo_alloc == 0); - mutex_exit(&msp->ms_lock); smo->smo_object = dmu_object_alloc(mos, DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); @@ -657,47 +1160,40 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * (sm->sm_start >> vd->vdev_ms_shift), sizeof (uint64_t), &smo->smo_object, tx); - mutex_enter(&msp->ms_lock); } - space_map_walk(freemap, space_map_add, freed_map); - - if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= - 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { - /* - * The in-core space map representation is twice as compact - * as the on-disk one, so it's time to condense the latter - * by generating a pure allocmap from first principles. 
- * - * This metaslab is 100% allocated, - * minus the content of the in-core map (sm), - * minus what's been freed this txg (freed_map), - * minus allocations from txgs in the future - * (because they haven't been committed yet). - */ - space_map_vacate(allocmap, NULL, NULL); - space_map_vacate(freemap, NULL, NULL); - - space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); + mutex_enter(&msp->ms_lock); - space_map_walk(sm, space_map_remove, allocmap); - space_map_walk(freed_map, space_map_remove, allocmap); + if (sm->sm_loaded && spa_sync_pass(spa) == 1 && + metaslab_should_condense(msp)) { + metaslab_condense(msp, txg, tx); + } else { + space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); + space_map_sync(*freemap, SM_FREE, smo, mos, tx); + } - for (t = 1; t < TXG_CONCURRENT_STATES; t++) - space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], - space_map_remove, allocmap); + space_map_vacate(allocmap, NULL, NULL); - mutex_exit(&msp->ms_lock); - space_map_truncate(smo, mos, tx); - mutex_enter(&msp->ms_lock); + /* + * For sync pass 1, we avoid walking the entire space map and + * instead will just swap the pointers for freemap and + * freed_map. We can safely do this since the freed_map is + * guaranteed to be empty on the initial pass. + */ + if (spa_sync_pass(spa) == 1) { + ASSERT0((*freed_map)->sm_space); + ASSERT0(avl_numnodes(&(*freed_map)->sm_root)); + space_map_swap(freemap, freed_map); + } else { + space_map_vacate(*freemap, space_map_add, *freed_map); } - space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); - space_map_sync(freemap, SM_FREE, smo, mos, tx); + ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space); + ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space); mutex_exit(&msp->ms_lock); - VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); + VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, sizeof (*smo)); bcopy(smo, db->db_data, sizeof (*smo)); @@ -715,43 +1211,79 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) { space_map_obj_t *smo = &msp->ms_smo; space_map_obj_t *smosync = &msp->ms_smo_syncing; - space_map_t *sm = &msp->ms_map; - space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *sm = msp->ms_map; + space_map_t *freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; + int64_t alloc_delta, defer_delta; int t; + ASSERT(!vd->vdev_ishole); + mutex_enter(&msp->ms_lock); /* * If this metaslab is just becoming available, initialize its - * allocmaps and freemaps and add its capacity to the vdev. + * allocmaps, freemaps, and defermap and add its capacity to the vdev. 
*/ - if (freed_map->sm_size == 0) { + if (freed_map == NULL) { + ASSERT(defer_map == NULL); for (t = 0; t < TXG_SIZE; t++) { - space_map_create(&msp->ms_allocmap[t], sm->sm_start, + msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t), + KM_PUSHPAGE); + space_map_create(msp->ms_allocmap[t], sm->sm_start, + sm->sm_size, sm->sm_shift, sm->sm_lock); + msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t), + KM_PUSHPAGE); + space_map_create(msp->ms_freemap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); - space_map_create(&msp->ms_freemap[t], sm->sm_start, + } + + for (t = 0; t < TXG_DEFER_SIZE; t++) { + msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t), + KM_PUSHPAGE); + space_map_create(msp->ms_defermap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); } - vdev_space_update(vd, sm->sm_size, 0, B_TRUE); + + freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; + + vdev_space_update(vd, 0, 0, sm->sm_size); } - vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE); + alloc_delta = smosync->smo_alloc - smo->smo_alloc; + defer_delta = freed_map->sm_space - defer_map->sm_space; - ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); - ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); + vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); + + ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0); + ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0); /* * If there's a space_map_load() in progress, wait for it to complete * so that we have a consistent view of the in-core space map. - * Then, add everything we freed in this txg to the map. + * Then, add defer_map (oldest deferred frees) to this map and + * transfer freed_map (this txg's frees) to defer_map. */ space_map_load_wait(sm); - space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm); + space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); + space_map_vacate(freed_map, space_map_add, defer_map); *smo = *smosync; + msp->ms_deferspace += defer_delta; + ASSERT3S(msp->ms_deferspace, >=, 0); + ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); + if (msp->ms_deferspace != 0) { + /* + * Keep syncing this metaslab until all deferred frees + * are back in circulation. + */ + vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); + } + /* * If the map is loaded but no longer active, evict it as soon as all * future allocations have synced. (If we unloaded it now and then @@ -761,10 +1293,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) int evictable = 1; for (t = 1; t < TXG_CONCURRENT_STATES; t++) - if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) + if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space) evictable = 0; - if (evictable) + if (evictable && !metaslab_debug) space_map_unload(sm); } @@ -773,12 +1305,42 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) mutex_exit(&msp->ms_lock); } +void +metaslab_sync_reassess(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + int64_t failures = mg->mg_alloc_failures; + int m; + + /* + * Re-evaluate all metaslabs which have lower offsets than the + * bonus area. 
+ */ + for (m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_map->sm_start > mg->mg_bonus_area) + break; + + mutex_enter(&msp->ms_lock); + metaslab_group_sort(mg, msp, metaslab_weight(msp)); + mutex_exit(&msp->ms_lock); + } + + atomic_add_64(&mg->mg_alloc_failures, -failures); + + /* + * Prefetch the next potential metaslabs + */ + metaslab_prefetch(mg); +} + static uint64_t metaslab_distance(metaslab_t *msp, dva_t *dva) { uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; - uint64_t start = msp->ms_map.sm_start >> ms_shift; + uint64_t start = msp->ms_map->sm_start >> ms_shift; if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) return (1ULL << 63); @@ -791,9 +1353,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva) } static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, - uint64_t min_distance, dva_t *dva, int d) +metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, + uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) { + spa_t *spa = mg->mg_vd->vdev_spa; metaslab_t *msp = NULL; uint64_t offset = -1ULL; avl_tree_t *t = &mg->mg_metaslab_tree; @@ -814,11 +1377,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, mutex_enter(&mg->mg_lock); for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { - if (msp->ms_weight < size) { + if (msp->ms_weight < asize) { + spa_dbgmsg(spa, "%s: failed to meet weight " + "requirement: vdev %llu, txg %llu, mg %p, " + "msp %p, psize %llu, asize %llu, " + "failures %llu, weight %llu", + spa_name(spa), mg->mg_vd->vdev_id, txg, + mg, msp, psize, asize, + mg->mg_alloc_failures, msp->ms_weight); mutex_exit(&mg->mg_lock); return (-1ULL); } - was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; if (activation_weight == METASLAB_WEIGHT_PRIMARY) break; @@ -837,15 +1406,44 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, if (msp == NULL) return (-1ULL); + /* + * If we've already reached the allowable number of failed + * allocation attempts on this metaslab group then we + * consider skipping it. We skip it only if we're allowed + * to "fast" gang, the physical size is larger than + * a gang block, and we're attempting to allocate from + * the primary metaslab. + */ + if (mg->mg_alloc_failures > zfs_mg_alloc_failures && + CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && + activation_weight == METASLAB_WEIGHT_PRIMARY) { + spa_dbgmsg(spa, "%s: skipping metaslab group: " + "vdev %llu, txg %llu, mg %p, psize %llu, " + "asize %llu, failures %llu", spa_name(spa), + mg->mg_vd->vdev_id, txg, mg, psize, asize, + mg->mg_alloc_failures); + return (-1ULL); + } + mutex_enter(&msp->ms_lock); /* + * If this metaslab is currently condensing then pick again as + * we can't manipulate this metaslab until it's committed + * to disk. + */ + if (msp->ms_map->sm_condensing) { + mutex_exit(&msp->ms_lock); + continue; + } + + /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we * were blocked on the metaslab lock. 
*/ - if (msp->ms_weight < size || (was_active && + if (msp->ms_weight < asize || (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK) && activation_weight == METASLAB_WEIGHT_PRIMARY)) { mutex_exit(&msp->ms_lock); @@ -860,23 +1458,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, continue; } - if (metaslab_activate(msp, activation_weight, size) != 0) { + if (metaslab_activate(msp, activation_weight) != 0) { mutex_exit(&msp->ms_lock); continue; } - if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) + if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL) break; - metaslab_passivate(msp, size - 1); + atomic_inc_64(&mg->mg_alloc_failures); + + metaslab_passivate(msp, space_map_maxsize(msp->ms_map)); mutex_exit(&msp->ms_lock); } - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize); mutex_exit(&msp->ms_lock); @@ -890,7 +1490,7 @@ static int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) { - metaslab_group_t *mg, *rotor; + metaslab_group_t *mg, *fast_mg, *rotor; vdev_t *vd; int dshift = 3; int all_zero; @@ -905,12 +1505,15 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, /* * For testing, make some blocks above a certain size be gang blocks. */ - if (psize >= metaslab_gang_bang && (lbolt & 3) == 0) + if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) return (ENOSPC); + if (flags & METASLAB_FASTWRITE) + mutex_enter(&mc->mc_fastwrite_lock); + /* * Start at the rotor and loop through all mgs until we find something. - * Note that there's no locking on mc_rotor or mc_allocated because + * Note that there's no locking on mc_rotor or mc_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * @@ -932,27 +1535,50 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, */ if (hintdva) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); - if (flags & METASLAB_HINTBP_AVOID) - mg = vd->vdev_mg->mg_next; - else + + /* + * It's possible the vdev we're using as the hint no + * longer exists (i.e. removed). Consult the rotor when + * all else fails. + */ + if (vd != NULL) { mg = vd->vdev_mg; + + if (flags & METASLAB_HINTBP_AVOID && + mg->mg_next != NULL) + mg = mg->mg_next; + } else { + mg = mc->mc_rotor; + } } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; + } else if (flags & METASLAB_FASTWRITE) { + mg = fast_mg = mc->mc_rotor; + + do { + if (fast_mg->mg_vd->vdev_pending_fastwrite < + mg->mg_vd->vdev_pending_fastwrite) + mg = fast_mg; + } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); + } else { mg = mc->mc_rotor; } /* - * If the hint put us into the wrong class, just follow the rotor. + * If the hint put us into the wrong metaslab class, or into a + * metaslab group that has been passivated, just follow the rotor. 
*/ - if (mg->mg_class != mc) + if (mg->mg_class != mc || mg->mg_activation_count <= 0) mg = mc->mc_rotor; rotor = mg; top: all_zero = B_TRUE; do { + ASSERT(mg->mg_activation_count == 1); + vd = mg->mg_vd; /* @@ -989,7 +1615,8 @@ top: asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); + offset = metaslab_group_alloc(mg, psize, asize, txg, distance, + dva, d, flags); if (offset != -1ULL) { /* * If we've just selected this metaslab group, @@ -997,32 +1624,35 @@ top: * over- or under-used relative to the pool, * and set an allocation bias to even it out. */ - if (mc->mc_allocated == 0) { + if (mc->mc_aliquot == 0) { vdev_stat_t *vs = &vd->vdev_stat; - uint64_t alloc, space; - int64_t vu, su; + int64_t vu, cu; - alloc = spa_get_alloc(spa); - space = spa_get_space(spa); - - /* - * Determine percent used in units of 0..1024. - * (This is just to avoid floating point.) - */ - vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); - su = (alloc << 10) / (space + 1); + vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); + cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); /* - * Bias by at most +/- 25% of the aliquot. + * Calculate how much more or less we should + * try to allocate from this device during + * this iteration around the rotor. + * For example, if a device is 80% full + * and the pool is 20% full then we should + * reduce allocations by 60% on this device. + * + * mg_bias = (20 - 80) * 512K / 100 = -307K + * + * This reduces allocations by 307K for this + * iteration. */ - mg->mg_bias = ((su - vu) * - (int64_t)mg->mg_aliquot) / (1024 * 4); + mg->mg_bias = ((cu - vu) * + (int64_t)mg->mg_aliquot) / 100; } - if (atomic_add_64_nv(&mc->mc_allocated, asize) >= + if ((flags & METASLAB_FASTWRITE) || + atomic_add_64_nv(&mc->mc_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mc->mc_rotor = mg->mg_next; - mc->mc_allocated = 0; + mc->mc_aliquot = 0; } DVA_SET_VDEV(&dva[d], vd->vdev_id); @@ -1030,11 +1660,17 @@ top: DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); DVA_SET_ASIZE(&dva[d], asize); + if (flags & METASLAB_FASTWRITE) { + atomic_add_64(&vd->vdev_pending_fastwrite, + psize); + mutex_exit(&mc->mc_fastwrite_lock); + } + return (0); } next: mc->mc_rotor = mg->mg_next; - mc->mc_allocated = 0; + mc->mc_aliquot = 0; } while ((mg = mg->mg_next) != rotor); if (!all_zero) { @@ -1051,6 +1687,8 @@ next: bzero(&dva[d], sizeof (dva_t)); + if (flags & METASLAB_FASTWRITE) + mutex_exit(&mc->mc_fastwrite_lock); return (ENOSPC); } @@ -1088,13 +1726,13 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) mutex_enter(&msp->ms_lock); if (now) { - space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], + space_map_remove(msp->ms_allocmap[txg & TXG_MASK], offset, size); - space_map_free(&msp->ms_map, offset, size); + space_map_free(msp->ms_map, offset, size); } else { - if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) + if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); + space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size); } mutex_exit(&msp->ms_lock); @@ -1114,7 +1752,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd; metaslab_t *msp; - int error; + int error = 0; ASSERT(DVA_IS_VALID(dva)); @@ -1129,18 +1767,23 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); 
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); + if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded) + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + + if (error == 0 && !space_map_contains(msp->ms_map, offset, size)) + error = ENOENT; + if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); } - space_map_claim(&msp->ms_map, offset, size); + space_map_claim(msp->ms_map, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size); } mutex_exit(&msp->ms_lock); @@ -1154,9 +1797,10 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; - int error = 0; + int d, error = 0; ASSERT(bp->blk_birth == 0); + ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); @@ -1169,7 +1813,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); - for (int d = 0; d < ndvas; d++) { + for (d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, txg, flags); if (error) { @@ -1186,7 +1830,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, spa_config_exit(spa, SCL_ALLOC, FTAG); - bp->blk_birth = txg; + BP_SET_BIRTH(bp, txg, txg); return (0); } @@ -1195,14 +1839,14 @@ void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); + int d, ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg); + ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); - for (int d = 0; d < ndvas; d++) + for (d = 0; d < ndvas; d++) metaslab_free_dva(spa, &dva[d], txg, now); spa_config_exit(spa, SCL_FREE, FTAG); @@ -1213,7 +1857,7 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); - int error = 0; + int d, error = 0; ASSERT(!BP_IS_HOLE(bp)); @@ -1228,7 +1872,7 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - for (int d = 0; d < ndvas; d++) + for (d = 0; d < ndvas; d++) if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) break; @@ -1238,3 +1882,53 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) return (error); } + +void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + uint64_t psize = BP_GET_PSIZE(bp); + int d; + vdev_t *vd; + + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(psize > 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (d = 0; d < ndvas; d++) { + if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) + continue; + atomic_add_64(&vd->vdev_pending_fastwrite, psize); + } + + spa_config_exit(spa, SCL_VDEV, FTAG); +} + +void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + uint64_t psize = BP_GET_PSIZE(bp); + int d; + vdev_t *vd; + + 
ASSERT(!BP_IS_HOLE(bp)); + ASSERT(psize > 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (d = 0; d < ndvas; d++) { + if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) + continue; + ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); + atomic_sub_64(&vd->vdev_pending_fastwrite, psize); + } + + spa_config_exit(spa, SCL_VDEV, FTAG); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(metaslab_debug, int, 0644); +MODULE_PARM_DESC(metaslab_debug, "keep space maps in core to verify frees"); +#endif /* _KERNEL && HAVE_SPL */
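
Illustrative note (not part of the patch): the new rotor-bias comment in metaslab_alloc_dva() describes the calculation as mg_bias = (cu - vu) * mg_aliquot / 100, where vu and cu are the vdev's and the class's percent-full values. The standalone userspace sketch below reproduces that arithmetic; the helper name compute_mg_bias, the sample figures, and the main() driver are hypothetical and chosen only to match the 80%-full-device / 20%-full-pool example given in the patch comment.

/*
 * Minimal sketch of the allocation-bias formula introduced above:
 *
 *   mg_bias = (cu - vu) * aliquot / 100
 *
 * vu = percent of this vdev allocated, cu = percent of the class allocated.
 * With the default 512K aliquot, an 80%-full device in a 20%-full pool
 * gives (20 - 80) * 512K / 100 ~= -307K, as the patch comment states.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static int64_t
compute_mg_bias(uint64_t vs_alloc, uint64_t vs_space,
    uint64_t mc_alloc, uint64_t mc_space, int64_t aliquot)
{
	/* Same integer arithmetic as the patch: percent full, +1 avoids /0. */
	int64_t vu = (int64_t)((vs_alloc * 100) / (vs_space + 1));
	int64_t cu = (int64_t)((mc_alloc * 100) / (mc_space + 1));

	/* A negative bias shrinks this group's share of the rotor round. */
	return (((cu - vu) * aliquot) / 100);
}

int
main(void)
{
	uint64_t gb = 1ULL << 30;
	int64_t aliquot = 512LL << 10;		/* default metaslab_aliquot */

	/* Device 80% full, pool (class) 20% full. */
	int64_t bias = compute_mg_bias(80 * gb, 100 * gb,
	    20 * gb, 100 * gb, aliquot);

	printf("mg_bias = %" PRId64 " bytes (~%" PRId64 " KiB)\n",
	    bias, bias / 1024);
	return (0);
}

Unlike the code being removed, which worked in 0..1024 fixed-point units and clamped the bias to +/- 25% of the aliquot, the new calculation uses whole percentages and applies the full difference.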