X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fmetaslab.c;h=b089f1eac4cf756b28dfeb326f7312053ea28ecb;hb=30930fba219642cb1dadf1c8ef60ff799e3dc424;hp=17b4b12c4ee4db8f4aa0b5af2e915e9d459294c2;hpb=428870ff734fdaccc342b33fc53cf94724409a46;p=zfs.git

diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 17b4b12..b089f1e 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -30,10 +31,31 @@
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 
+#define WITH_DF_BLOCK_ALLOCATOR
+
+/*
+ * Allow allocations to switch to gang blocks quickly. We do this to
+ * avoid having to load lots of space_maps in a given txg. There are,
+ * however, some cases where we want to avoid "fast" ganging and instead
+ * we want to do an exhaustive search of all metaslabs on this device.
+ * Currently we don't allow any gang or dump device related allocations
+ * to "fast" gang.
+ */
+#define CAN_FASTGANG(flags) \
+	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
+	METASLAB_GANG_AVOID)))
+
 uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 
 /*
+ * This value defines the number of allowed allocation failures per vdev.
+ * If a device reaches this threshold in a given txg then we consider skipping
+ * allocations on that device.
+ */
+int zfs_mg_alloc_failures;
+
+/*
  * Metaslab debugging: when set, keeps all space maps in core to verify frees.
  */
 static int metaslab_debug = 0;
@@ -350,6 +372,9 @@ metaslab_segsize_compare(const void *x1, const void *x2)
 	return (0);
 }
 
+#if defined(WITH_FF_BLOCK_ALLOCATOR) || \
+    defined(WITH_DF_BLOCK_ALLOCATOR) || \
+    defined(WITH_CDF_BLOCK_ALLOCATOR)
 /*
  * This is a helper function that can be used by the allocator to find
  * a suitable block to allocate. This will search the specified AVL
@@ -389,6 +414,7 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
 	*cursor = 0;
 	return (metaslab_block_picker(t, cursor, size, align));
 }
+#endif /* WITH_FF/DF/CDF_BLOCK_ALLOCATOR */
 
 static void
 metaslab_pp_load(space_map_t *sm)
@@ -452,6 +478,7 @@ metaslab_pp_maxsize(space_map_t *sm)
 	return (ss->ss_end - ss->ss_start);
 }
 
+#if defined(WITH_FF_BLOCK_ALLOCATOR)
 /*
  * ==========================================================================
  * The first-fit block allocator
@@ -484,6 +511,10 @@ static space_map_ops_t metaslab_ff_ops = {
 	metaslab_ff_fragmented
 };
 
+space_map_ops_t *zfs_metaslab_ops = &metaslab_ff_ops;
+#endif /* WITH_FF_BLOCK_ALLOCATOR */
+
+#if defined(WITH_DF_BLOCK_ALLOCATOR)
 /*
  * ==========================================================================
  * Dynamic block allocator -
@@ -543,11 +574,15 @@ static space_map_ops_t metaslab_df_ops = {
 	metaslab_df_fragmented
 };
 
+space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+#endif /* WITH_DF_BLOCK_ALLOCATOR */
+
 /*
  * ==========================================================================
  * Other experimental allocators
  * ==========================================================================
  */
+#if defined(WITH_CDF_BLOCK_ALLOCATOR)
 static uint64_t
 metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
 {
@@ -607,6 +642,10 @@ static space_map_ops_t metaslab_cdf_ops = {
 	metaslab_cdf_fragmented
 };
 
+space_map_ops_t *zfs_metaslab_ops = &metaslab_cdf_ops;
+#endif /* WITH_CDF_BLOCK_ALLOCATOR */
+
+#if defined(WITH_NDF_BLOCK_ALLOCATOR)
 uint64_t metaslab_ndf_clump_shift = 4;
 
 static uint64_t
@@ -672,6 +711,7 @@ static space_map_ops_t metaslab_ndf_ops = {
 };
 
 space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
+#endif /* WITH_NDF_BLOCK_ALLOCATOR */
 
 /*
  * ==========================================================================
@@ -730,6 +770,7 @@ void
 metaslab_fini(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
+	int t;
 
 	vdev_space_update(mg->mg_vd, -msp->ms_smo.smo_alloc,
 	    0, -msp->ms_map.sm_size);
@@ -741,12 +782,12 @@ metaslab_fini(metaslab_t *msp)
 	space_map_unload(&msp->ms_map);
 	space_map_destroy(&msp->ms_map);
 
-	for (int t = 0; t < TXG_SIZE; t++) {
+	for (t = 0; t < TXG_SIZE; t++) {
 		space_map_destroy(&msp->ms_allocmap[t]);
 		space_map_destroy(&msp->ms_freemap[t]);
 	}
 
-	for (int t = 0; t < TXG_DEFER_SIZE; t++)
+	for (t = 0; t < TXG_DEFER_SIZE; t++)
 		space_map_destroy(&msp->ms_defermap[t]);
 
 	ASSERT3S(msp->ms_deferspace, ==, 0);
@@ -844,11 +885,12 @@ metaslab_prefetch(metaslab_group_t *mg)
 }
 
 static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	space_map_t *sm = &msp->ms_map;
 	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
+	int t;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
@@ -862,7 +904,7 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 				metaslab_group_sort(msp->ms_group, msp, 0);
 				return (error);
 			}
-			for (int t = 0; t < TXG_DEFER_SIZE; t++)
+			for (t = 0; t < TXG_DEFER_SIZE; t++)
 				space_map_walk(&msp->ms_defermap[t],
 				    space_map_claim, sm);
 
@@ -877,13 +919,6 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 			mutex_exit(&mg->mg_lock);
 		}
 
-		/*
-		 * If we were able to load the map then make sure
-		 * that this map is still able to satisfy our request.
-		 */
-		if (msp->ms_weight < size)
-			return (ENOSPC);
-
 		metaslab_group_sort(msp->ms_group, msp,
 		    msp->ms_weight | activation_weight);
 	}
@@ -922,6 +957,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 	space_map_obj_t *smo = &msp->ms_smo_syncing;
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
+	int t;
 
 	ASSERT(!vd->vdev_ishole);
 
@@ -977,11 +1013,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 		space_map_walk(sm, space_map_remove, allocmap);
 		space_map_walk(freed_map, space_map_remove, allocmap);
 
-		for (int t = 0; t < TXG_DEFER_SIZE; t++)
+		for (t = 0; t < TXG_DEFER_SIZE; t++)
 			space_map_walk(&msp->ms_defermap[t],
 			    space_map_remove, allocmap);
 
-		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
+		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
 			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
 			    space_map_remove, allocmap);
 
@@ -1019,6 +1055,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	int64_t alloc_delta, defer_delta;
+	int t;
 
 	ASSERT(!vd->vdev_ishole);
 
@@ -1029,14 +1066,14 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	 * allocmaps and freemaps and add its capacity to the vdev.
 	 */
 	if (freed_map->sm_size == 0) {
-		for (int t = 0; t < TXG_SIZE; t++) {
+		for (t = 0; t < TXG_SIZE; t++) {
 			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
 			space_map_create(&msp->ms_freemap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
 		}
 
-		for (int t = 0; t < TXG_DEFER_SIZE; t++)
+		for (t = 0; t < TXG_DEFER_SIZE; t++)
 			space_map_create(&msp->ms_defermap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
 
@@ -1082,7 +1119,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 		int evictable = 1;
 
-		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
+		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
 			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
 				evictable = 0;
 
@@ -1099,12 +1136,14 @@ void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
+	int64_t failures = mg->mg_alloc_failures;
+	int m;
 
 	/*
 	 * Re-evaluate all metaslabs which have lower offsets than the
 	 * bonus area.
 	 */
-	for (int m = 0; m < vd->vdev_ms_count; m++) {
+	for (m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *msp = vd->vdev_ms[m];
 
 		if (msp->ms_map.sm_start > mg->mg_bonus_area)
@@ -1115,6 +1154,8 @@ metaslab_sync_reassess(metaslab_group_t *mg)
 		mutex_exit(&msp->ms_lock);
 	}
 
+	atomic_add_64(&mg->mg_alloc_failures, -failures);
+
 	/*
 	 * Prefetch the next potential metaslabs
 	 */
@@ -1139,9 +1180,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
 }
 
 static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
-    uint64_t min_distance, dva_t *dva, int d)
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
+    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
 {
+	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
@@ -1162,11 +1204,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 
 		mutex_enter(&mg->mg_lock);
 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
-			if (msp->ms_weight < size) {
+			if (msp->ms_weight < asize) {
+				spa_dbgmsg(spa, "%s: failed to meet weight "
+				    "requirement: vdev %llu, txg %llu, mg %p, "
+				    "msp %p, psize %llu, asize %llu, "
+				    "failures %llu, weight %llu",
+				    spa_name(spa), mg->mg_vd->vdev_id, txg,
+				    mg, msp, psize, asize,
+				    mg->mg_alloc_failures, msp->ms_weight);
 				mutex_exit(&mg->mg_lock);
 				return (-1ULL);
 			}
-
 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
 				break;
@@ -1185,6 +1233,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 		if (msp == NULL)
 			return (-1ULL);
 
+		/*
+		 * If we've already reached the allowable number of failed
+		 * allocation attempts on this metaslab group then we
+		 * consider skipping it. We skip it only if we're allowed
+		 * to "fast" gang, the physical size is larger than
+		 * a gang block, and we're attempting to allocate from
+		 * the primary metaslab.
+		 */
+		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
+		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
+		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
+			spa_dbgmsg(spa, "%s: skipping metaslab group: "
+			    "vdev %llu, txg %llu, mg %p, psize %llu, "
+			    "asize %llu, failures %llu", spa_name(spa),
+			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
+			    mg->mg_alloc_failures);
+			return (-1ULL);
+		}
+
 		mutex_enter(&msp->ms_lock);
 
 		/*
@@ -1193,7 +1260,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 		 * another thread may have changed the weight while we
 		 * were blocked on the metaslab lock.
 		 */
-		if (msp->ms_weight < size || (was_active &&
+		if (msp->ms_weight < asize || (was_active &&
 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
 			mutex_exit(&msp->ms_lock);
@@ -1208,14 +1275,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 			continue;
 		}
 
-		if (metaslab_activate(msp, activation_weight, size) != 0) {
+		if (metaslab_activate(msp, activation_weight) != 0) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
-		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
 			break;
 
+		atomic_inc_64(&mg->mg_alloc_failures);
+
 		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
 
 		mutex_exit(&msp->ms_lock);
@@ -1224,7 +1293,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
-	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
 
 	mutex_exit(&msp->ms_lock);
 
@@ -1351,7 +1420,8 @@ top:
 		asize = vdev_psize_to_asize(vd, psize);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
-		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
+		    dva, d, flags);
 		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
@@ -1363,18 +1433,24 @@ top:
 			vdev_stat_t *vs = &vd->vdev_stat;
 			int64_t vu, cu;
 
-			/*
-			 * Determine percent used in units of 0..1024.
-			 * (This is just to avoid floating point.)
-			 */
-			vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
-			cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
+			vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
+			cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
 
 			/*
-			 * Bias by at most +/- 25% of the aliquot.
+			 * Calculate how much more or less we should
+			 * try to allocate from this device during
+			 * this iteration around the rotor.
+			 * For example, if a device is 80% full
+			 * and the pool is 20% full then we should
+			 * reduce allocations by 60% on this device.
+			 *
+			 * mg_bias = (20 - 80) * 512K / 100 = -307K
+			 *
+			 * This reduces allocations by 307K for this
+			 * iteration.
 			 */
 			mg->mg_bias = ((cu - vu) *
-			    (int64_t)mg->mg_aliquot) / (1024 * 4);
+			    (int64_t)mg->mg_aliquot) / 100;
 		}
 
 		if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
@@ -1488,7 +1564,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 	mutex_enter(&msp->ms_lock);
 
 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
-		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 
 	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
 		error = ENOENT;
@@ -1517,7 +1593,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
 {
 	dva_t *dva = bp->blk_dva;
 	dva_t *hintdva = hintbp->blk_dva;
-	int error = 0;
+	int d, error = 0;
 
 	ASSERT(bp->blk_birth == 0);
 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
@@ -1533,7 +1609,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
 	ASSERT(BP_GET_NDVAS(bp) == 0);
 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
 
-	for (int d = 0; d < ndvas; d++) {
+	for (d = 0; d < ndvas; d++) {
 		error = metaslab_alloc_dva(spa, mc, psize, dva, d,
 		    hintdva, txg, flags);
 		if (error) {
@@ -1559,14 +1635,14 @@ void
 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 {
 	const dva_t *dva = bp->blk_dva;
-	int ndvas = BP_GET_NDVAS(bp);
+	int d, ndvas = BP_GET_NDVAS(bp);
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
 
 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
 
-	for (int d = 0; d < ndvas; d++)
+	for (d = 0; d < ndvas; d++)
 		metaslab_free_dva(spa, &dva[d], txg, now);
 
 	spa_config_exit(spa, SCL_FREE, FTAG);
@@ -1577,7 +1653,7 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = BP_GET_NDVAS(bp);
-	int error = 0;
+	int d, error = 0;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
@@ -1592,7 +1668,7 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
-	for (int d = 0; d < ndvas; d++)
+	for (d = 0; d < ndvas; d++)
 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
 			break;
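
As a quick sanity check of the percent-based rotor bias this patch introduces in metaslab_alloc_dva(), the small self-contained C program below plugs the worked example from the new comment (a vdev that is 80% full in a pool that is 20% full, with the default 512K aliquot) into the new formula. The variable names (vu, cu, mg_aliquot, mg_bias) mirror the patch, but this is an illustrative sketch only, not code from the ZFS tree, and it uses plain integers rather than vdev_stat_t/metaslab_class_t.

/*
 * Standalone sketch of the rotor-bias arithmetic described above.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t vu = 80;			/* vdev percent used */
	int64_t cu = 20;			/* pool (class) percent used */
	int64_t mg_aliquot = 512LL << 10;	/* default 512K aliquot */
	int64_t mg_bias;

	/* mg_bias = (20 - 80) * 512K / 100, roughly -307K */
	mg_bias = ((cu - vu) * mg_aliquot) / 100;

	printf("mg_bias = %lld bytes (%lldK)\n",
	    (long long)mg_bias, (long long)(mg_bias / 1024));

	return (0);
}

Compiled and run, this prints "mg_bias = -314572 bytes (-307K)", matching the -307K figure in the patch comment: allocations on the over-full device are trimmed by roughly 307K on this pass around the rotor, while an under-full device would get a positive bias of the same form.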