X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fmetaslab.c;h=d199921b7edf1a17d6bc7ab2c20275941a340d91;hb=1c24b699b0c7590e135f4701b50a4c933ebe0499;hp=17b4b12c4ee4db8f4aa0b5af2e915e9d459294c2;hpb=428870ff734fdaccc342b33fc53cf94724409a46;p=zfs.git

diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 17b4b12..d199921 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -30,10 +31,31 @@
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 
+#define WITH_DF_BLOCK_ALLOCATOR
+
+/*
+ * Allow allocations to switch to gang blocks quickly. We do this to
+ * avoid having to load lots of space_maps in a given txg. There are,
+ * however, some cases where we want to avoid "fast" ganging and instead
+ * we want to do an exhaustive search of all metaslabs on this device.
+ * Currently we don't allow any gang, zil, or dump device related allocations
+ * to "fast" gang.
+ */
+#define	CAN_FASTGANG(flags) \
+	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
+	METASLAB_GANG_AVOID)))
+
 uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 
 /*
+ * This value defines the number of allowed allocation failures per vdev.
+ * If a device reaches this threshold in a given txg then we consider skipping
+ * allocations on that device.
+ */
+int zfs_mg_alloc_failures;
+
+/*
  * Metaslab debugging: when set, keeps all space maps in core to verify frees.
  */
 static int metaslab_debug = 0;
@@ -80,11 +102,12 @@ metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
 {
 	metaslab_class_t *mc;
 
-	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_PUSHPAGE);
 
 	mc->mc_spa = spa;
 	mc->mc_rotor = NULL;
 	mc->mc_ops = ops;
+	mutex_init(&mc->mc_fastwrite_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	return (mc);
 }
@@ -98,6 +121,7 @@ metaslab_class_destroy(metaslab_class_t *mc)
 	ASSERT(mc->mc_space == 0);
 	ASSERT(mc->mc_dspace == 0);
 
+	mutex_destroy(&mc->mc_fastwrite_lock);
 	kmem_free(mc, sizeof (metaslab_class_t));
 }
 
@@ -195,7 +219,7 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 {
 	metaslab_group_t *mg;
 
-	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
+	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_PUSHPAGE);
 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
@@ -350,6 +374,9 @@ metaslab_segsize_compare(const void *x1, const void *x2)
 	return (0);
 }
 
+#if defined(WITH_FF_BLOCK_ALLOCATOR) || \
+    defined(WITH_DF_BLOCK_ALLOCATOR) || \
+    defined(WITH_CDF_BLOCK_ALLOCATOR)
 /*
  * This is a helper function that can be used by the allocator to find
  * a suitable block to allocate. This will search the specified AVL
@@ -389,6 +416,7 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
 	*cursor = 0;
 	return (metaslab_block_picker(t, cursor, size, align));
 }
+#endif /* WITH_FF/DF/CDF_BLOCK_ALLOCATOR */
 
 static void
 metaslab_pp_load(space_map_t *sm)
@@ -396,9 +424,9 @@ metaslab_pp_load(space_map_t *sm)
 	space_seg_t *ss;
 
 	ASSERT(sm->sm_ppd == NULL);
-	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_PUSHPAGE);
 
-	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_PUSHPAGE);
 	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
 	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
 
@@ -452,6 +480,7 @@ metaslab_pp_maxsize(space_map_t *sm)
 	return (ss->ss_end - ss->ss_start);
 }
 
+#if defined(WITH_FF_BLOCK_ALLOCATOR)
 /*
  * ==========================================================================
  * The first-fit block allocator
@@ -484,6 +513,10 @@ static space_map_ops_t metaslab_ff_ops = {
 	metaslab_ff_fragmented
 };
 
+space_map_ops_t *zfs_metaslab_ops = &metaslab_ff_ops;
+#endif /* WITH_FF_BLOCK_ALLOCATOR */
+
+#if defined(WITH_DF_BLOCK_ALLOCATOR)
 /*
  * ==========================================================================
  * Dynamic block allocator -
@@ -543,11 +576,15 @@ static space_map_ops_t metaslab_df_ops = {
 	metaslab_df_fragmented
 };
 
+space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+#endif /* WITH_DF_BLOCK_ALLOCATOR */
+
 /*
  * ==========================================================================
  * Other experimental allocators
  * ==========================================================================
  */
+#if defined(WITH_CDF_BLOCK_ALLOCATOR)
 static uint64_t
 metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
 {
@@ -607,6 +644,10 @@ static space_map_ops_t metaslab_cdf_ops = {
 	metaslab_cdf_fragmented
 };
 
+space_map_ops_t *zfs_metaslab_ops = &metaslab_cdf_ops;
+#endif /* WITH_CDF_BLOCK_ALLOCATOR */
+
+#if defined(WITH_NDF_BLOCK_ALLOCATOR)
 uint64_t metaslab_ndf_clump_shift = 4;
 
 static uint64_t
@@ -672,6 +713,7 @@ static space_map_ops_t metaslab_ndf_ops = {
 };
 
 space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
+#endif /* WITH_NDF_BLOCK_ALLOCATOR */
 
 /*
  * ==========================================================================
@@ -685,7 +727,7 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
 	vdev_t *vd = mg->mg_vd;
 	metaslab_t *msp;
 
-	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+	msp = kmem_zalloc(sizeof (metaslab_t), KM_PUSHPAGE);
 	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	msp->ms_smo_syncing = *smo;
@@ -730,6 +772,7 @@ void
 metaslab_fini(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
+	int t;
 
 	vdev_space_update(mg->mg_vd,
 	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
@@ -741,12 +784,12 @@ metaslab_fini(metaslab_t *msp)
 	space_map_unload(&msp->ms_map);
 	space_map_destroy(&msp->ms_map);
 
-	for (int t = 0; t < TXG_SIZE; t++) {
+	for (t = 0; t < TXG_SIZE; t++) {
 		space_map_destroy(&msp->ms_allocmap[t]);
 		space_map_destroy(&msp->ms_freemap[t]);
 	}
 
-	for (int t = 0; t < TXG_DEFER_SIZE; t++)
+	for (t = 0; t < TXG_DEFER_SIZE; t++)
 		space_map_destroy(&msp->ms_defermap[t]);
 
 	ASSERT3S(msp->ms_deferspace, ==, 0);
@@ -844,11 +887,12 @@ metaslab_prefetch(metaslab_group_t *mg)
 }
 
 static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	space_map_t *sm = &msp->ms_map;
 	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
+	int t;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
@@ -862,7 +906,7 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 				metaslab_group_sort(msp->ms_group, msp, 0);
 				return (error);
 			}
-			for (int t = 0; t < TXG_DEFER_SIZE; t++)
+			for (t = 0; t < TXG_DEFER_SIZE; t++)
 				space_map_walk(&msp->ms_defermap[t],
 				    space_map_claim, sm);
 
@@ -877,13 +921,6 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 			mutex_exit(&mg->mg_lock);
 		}
 
-		/*
-		 * If we were able to load the map then make sure
-		 * that this map is still able to satisfy our request.
-		 */
-		if (msp->ms_weight < size)
-			return (ENOSPC);
-
 		metaslab_group_sort(msp->ms_group, msp,
 		    msp->ms_weight | activation_weight);
 	}
@@ -922,6 +959,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 	space_map_obj_t *smo = &msp->ms_smo_syncing;
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
+	int t;
 
 	ASSERT(!vd->vdev_ishole);
 
@@ -977,11 +1015,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 		space_map_walk(sm, space_map_remove, allocmap);
 		space_map_walk(freed_map, space_map_remove, allocmap);
 
-		for (int t = 0; t < TXG_DEFER_SIZE; t++)
+		for (t = 0; t < TXG_DEFER_SIZE; t++)
 			space_map_walk(&msp->ms_defermap[t],
 			    space_map_remove, allocmap);
 
-		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
+		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
 			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
 			    space_map_remove, allocmap);
 
@@ -1019,6 +1057,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	int64_t alloc_delta, defer_delta;
+	int t;
 
 	ASSERT(!vd->vdev_ishole);
 
@@ -1029,14 +1068,14 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	 * allocmaps and freemaps and add its capacity to the vdev.
 	 */
 	if (freed_map->sm_size == 0) {
-		for (int t = 0; t < TXG_SIZE; t++) {
+		for (t = 0; t < TXG_SIZE; t++) {
 			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
 			space_map_create(&msp->ms_freemap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
 		}
 
-		for (int t = 0; t < TXG_DEFER_SIZE; t++)
+		for (t = 0; t < TXG_DEFER_SIZE; t++)
 			space_map_create(&msp->ms_defermap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
 
@@ -1082,7 +1121,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 		int evictable = 1;
 
-		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
+		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
 			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
 				evictable = 0;
 
@@ -1099,12 +1138,14 @@ void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
+	int64_t failures = mg->mg_alloc_failures;
+	int m;
 
 	/*
 	 * Re-evaluate all metaslabs which have lower offsets than the
 	 * bonus area.
 	 */
-	for (int m = 0; m < vd->vdev_ms_count; m++) {
+	for (m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *msp = vd->vdev_ms[m];
 
 		if (msp->ms_map.sm_start > mg->mg_bonus_area)
@@ -1115,6 +1156,8 @@ metaslab_sync_reassess(metaslab_group_t *mg)
 		mutex_exit(&msp->ms_lock);
 	}
 
+	atomic_add_64(&mg->mg_alloc_failures, -failures);
+
 	/*
 	 * Prefetch the next potential metaslabs
 	 */
@@ -1139,9 +1182,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
 }
 
 static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
-    uint64_t min_distance, dva_t *dva, int d)
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
+    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
 {
+	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
@@ -1162,11 +1206,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 
 		mutex_enter(&mg->mg_lock);
 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
-			if (msp->ms_weight < size) {
+			if (msp->ms_weight < asize) {
+				spa_dbgmsg(spa, "%s: failed to meet weight "
+				    "requirement: vdev %llu, txg %llu, mg %p, "
+				    "msp %p, psize %llu, asize %llu, "
+				    "failures %llu, weight %llu",
+				    spa_name(spa), mg->mg_vd->vdev_id, txg,
+				    mg, msp, psize, asize,
+				    mg->mg_alloc_failures, msp->ms_weight);
 				mutex_exit(&mg->mg_lock);
 				return (-1ULL);
 			}
-
 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
 				break;
@@ -1185,6 +1235,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 		if (msp == NULL)
 			return (-1ULL);
 
+		/*
+		 * If we've already reached the allowable number of failed
+		 * allocation attempts on this metaslab group then we
+		 * consider skipping it. We skip it only if we're allowed
+		 * to "fast" gang, the physical size is larger than
+		 * a gang block, and we're attempting to allocate from
+		 * the primary metaslab.
+		 */
+		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
+		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
+		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
+			spa_dbgmsg(spa, "%s: skipping metaslab group: "
+			    "vdev %llu, txg %llu, mg %p, psize %llu, "
+			    "asize %llu, failures %llu", spa_name(spa),
+			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
+			    mg->mg_alloc_failures);
+			return (-1ULL);
+		}
+
 		mutex_enter(&msp->ms_lock);
 
 		/*
@@ -1193,7 +1262,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 		 * another thread may have changed the weight while we
 		 * were blocked on the metaslab lock.
 		 */
-		if (msp->ms_weight < size || (was_active &&
+		if (msp->ms_weight < asize || (was_active &&
 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
 			mutex_exit(&msp->ms_lock);
@@ -1208,14 +1277,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 			continue;
 		}
 
-		if (metaslab_activate(msp, activation_weight, size) != 0) {
+		if (metaslab_activate(msp, activation_weight) != 0) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
-		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
 			break;
 
+		atomic_inc_64(&mg->mg_alloc_failures);
+
 		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
 
 		mutex_exit(&msp->ms_lock);
@@ -1224,7 +1295,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
-	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
 
 	mutex_exit(&msp->ms_lock);
 
@@ -1238,7 +1309,7 @@ static int
 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
 {
-	metaslab_group_t *mg, *rotor;
+	metaslab_group_t *mg, *fast_mg, *rotor;
 	vdev_t *vd;
 	int dshift = 3;
 	int all_zero;
@@ -1256,6 +1327,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
 		return (ENOSPC);
 
+	if (flags & METASLAB_FASTWRITE)
+		mutex_enter(&mc->mc_fastwrite_lock);
+
 	/*
 	 * Start at the rotor and loop through all mgs until we find something.
 	 * Note that there's no locking on mc_rotor or mc_aliquot because
@@ -1298,6 +1372,15 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	} else if (d != 0) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
 		mg = vd->vdev_mg->mg_next;
+	} else if (flags & METASLAB_FASTWRITE) {
+		mg = fast_mg = mc->mc_rotor;
+
+		do {
+			if (fast_mg->mg_vd->vdev_pending_fastwrite <
+			    mg->mg_vd->vdev_pending_fastwrite)
+				mg = fast_mg;
+		} while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
+
 	} else {
 		mg = mc->mc_rotor;
 	}
@@ -1351,7 +1434,8 @@ top:
 		asize = vdev_psize_to_asize(vd, psize);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
-		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
+		    dva, d, flags);
 		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
@@ -1363,21 +1447,28 @@ top:
 				vdev_stat_t *vs = &vd->vdev_stat;
 				int64_t vu, cu;
 
-				/*
-				 * Determine percent used in units of 0..1024.
-				 * (This is just to avoid floating point.)
-				 */
-				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
-				cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
+				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
+				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
 
 				/*
-				 * Bias by at most +/- 25% of the aliquot.
+				 * Calculate how much more or less we should
+				 * try to allocate from this device during
+				 * this iteration around the rotor.
+				 * For example, if a device is 80% full
+				 * and the pool is 20% full then we should
+				 * reduce allocations by 60% on this device.
+				 *
+				 * mg_bias = (20 - 80) * 512K / 100 = -307K
+				 *
+				 * This reduces allocations by 307K for this
+				 * iteration.
 				 */
 				mg->mg_bias = ((cu - vu) *
-				    (int64_t)mg->mg_aliquot) / (1024 * 4);
+				    (int64_t)mg->mg_aliquot) / 100;
 			}
 
-			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
+			if ((flags & METASLAB_FASTWRITE) ||
+			    atomic_add_64_nv(&mc->mc_aliquot, asize) >=
 			    mg->mg_aliquot + mg->mg_bias) {
 				mc->mc_rotor = mg->mg_next;
 				mc->mc_aliquot = 0;
@@ -1388,6 +1479,12 @@ top:
 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
 			DVA_SET_ASIZE(&dva[d], asize);
 
+			if (flags & METASLAB_FASTWRITE) {
+				atomic_add_64(&vd->vdev_pending_fastwrite,
+				    psize);
+				mutex_exit(&mc->mc_fastwrite_lock);
+			}
+
 			return (0);
 		}
 next:
@@ -1409,6 +1506,8 @@ next:
 
 	bzero(&dva[d], sizeof (dva_t));
 
+	if (flags & METASLAB_FASTWRITE)
+		mutex_exit(&mc->mc_fastwrite_lock);
 	return (ENOSPC);
 }
 
@@ -1488,7 +1587,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 	mutex_enter(&msp->ms_lock);
 
 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
-		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 
 	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
 		error = ENOENT;
@@ -1517,7 +1616,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
 {
 	dva_t *dva = bp->blk_dva;
 	dva_t *hintdva = hintbp->blk_dva;
-	int error = 0;
+	int d, error = 0;
 
 	ASSERT(bp->blk_birth == 0);
 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
@@ -1533,7 +1632,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
 	ASSERT(BP_GET_NDVAS(bp) == 0);
 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
 
-	for (int d = 0; d < ndvas; d++) {
+	for (d = 0; d < ndvas; d++) {
 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
 		    txg, flags);
 		if (error) {
@@ -1559,14 +1658,14 @@ void
 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 {
 	const dva_t *dva = bp->blk_dva;
-	int ndvas = BP_GET_NDVAS(bp);
+	int d, ndvas = BP_GET_NDVAS(bp);
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
 
 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
 
-	for (int d = 0; d < ndvas; d++)
+	for (d = 0; d < ndvas; d++)
 		metaslab_free_dva(spa, &dva[d], txg, now);
 
 	spa_config_exit(spa, SCL_FREE, FTAG);
@@ -1577,7 +1676,7 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = BP_GET_NDVAS(bp);
-	int error = 0;
+	int d, error = 0;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
@@ -1592,7 +1691,7 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
-	for (int d = 0; d < ndvas; d++)
+	for (d = 0; d < ndvas; d++)
 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
 			break;
 
@@ -1602,3 +1701,48 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 
 	return (error);
 }
+
+void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+	uint64_t psize = BP_GET_PSIZE(bp);
+	int d;
+	vdev_t *vd;
+
+	ASSERT(!BP_IS_HOLE(bp));
+	ASSERT(psize > 0);
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	for (d = 0; d < ndvas; d++) {
+		if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
+			continue;
+		atomic_add_64(&vd->vdev_pending_fastwrite, psize);
+	}
+
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+}
+
+void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+	uint64_t psize = BP_GET_PSIZE(bp);
+	int d;
+	vdev_t *vd;
+
+	ASSERT(!BP_IS_HOLE(bp));
+	ASSERT(psize > 0);
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	for (d = 0; d < ndvas; d++) {
+		if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
+			continue;
+		ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
+		atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
+	}
+
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+}