X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fzio.c;h=c96442d0bd715af7b781ce6ca70ebdf80753a013;hb=30930fba219642cb1dadf1c8ef60ff799e3dc424;hp=2c38e10a1efaad1956507f7fc12a84d4a81dcd87;hpb=691f6ac4c2858d64afc2a0dc1bd2b8c041d68502;p=zfs.git

diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 2c38e10..c96442d 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -20,6 +20,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -79,6 +81,7 @@ int zio_delay_max = ZIO_DELAY_MAX;
 #ifdef _KERNEL
 extern vmem_t *zio_alloc_arena;
 #endif
+extern int zfs_mg_alloc_failures;
 
 /*
  * An allocating zio is one that either currently has the DVA allocate
@@ -86,7 +89,7 @@ extern vmem_t *zio_alloc_arena;
  */
 #define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
 
-boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;
+int zio_requeue_io_start_cut_in_line = 1;
 
 #ifdef ZFS_DEBUG
 int zio_buf_debug_limit = 16384;
@@ -106,9 +109,9 @@ zio_init(void)
 	data_alloc_arena = zio_alloc_arena;
 #endif
 	zio_cache = kmem_cache_create("zio_cache",
-	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM);
 	zio_link_cache = kmem_cache_create("zio_link_cache",
-	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM);
 
 	/*
 	 * For small buffers, we want a cache for each multiple of
@@ -134,17 +137,27 @@ zio_init(void)
 
 		if (align != 0) {
 			char name[36];
+			int flags = zio_bulk_flags;
+
+			/*
+			 * The smallest buffers (512b) are heavily used and
+			 * experience a lot of churn.  The slabs allocated
+			 * for them are also relatively small (32K).  Thus
+			 * in over to avoid expensive calls to vmalloc() we
+			 * make an exception to the usual slab allocation
+			 * policy and force these buffers to be kmem backed.
+			 */
+			if (size == (1 << SPA_MINBLOCKSHIFT))
+				flags |= KMC_KMEM;
+
 			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 			zio_buf_cache[c] = kmem_cache_create(name, size,
-			    align, NULL, NULL, NULL, NULL, NULL,
-			    (size > zio_buf_debug_limit ? KMC_NODEBUG : 0) |
-			    zio_bulk_flags);
+			    align, NULL, NULL, NULL, NULL, NULL, flags);
 
 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
-			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
-			    (size > zio_buf_debug_limit ? KMC_NODEBUG : 0) |
-			    zio_bulk_flags);
+			    align, NULL, NULL, NULL, NULL,
+			    data_alloc_arena, flags);
 		}
 	}
 
@@ -158,6 +171,12 @@ zio_init(void)
 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}
 
+	/*
+	 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
+	 * to fail 3 times per txg or 8 failures, whichever is greater.
+	 */
+	zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
+
 	zio_inject_init();
 }
 
@@ -552,6 +571,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 		zio_add_child(pio, zio);
 	}
 
+	taskq_init_ent(&zio->io_tqent);
+
 	return (zio);
 }
 
@@ -1055,7 +1076,7 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
 {
 	spa_t *spa = zio->io_spa;
 	zio_type_t t = zio->io_type;
-	int flags = TQ_NOSLEEP | (cutinline ? TQ_FRONT : 0);
+	int flags = (cutinline ? TQ_FRONT : 0);
 
 	/*
 	 * If we're a config writer or a probe, the normal issue and
@@ -1080,8 +1101,14 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
 
 	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
 
-	while (taskq_dispatch(spa->spa_zio_taskq[t][q],
-	    (task_func_t *)zio_execute, zio, flags) == 0); /* do nothing */
+	/*
+	 * NB: We are assuming that the zio can only be dispatched
+	 * to a single taskq at a time.  It would be a grievous error
+	 * to dispatch the zio to another taskq at the same time.
+	 */
+	ASSERT(taskq_empty_ent(&zio->io_tqent));
+	taskq_dispatch_ent(spa->spa_zio_taskq[t][q],
+	    (task_func_t *)zio_execute, zio, flags, &zio->io_tqent);
 }
 
 static boolean_t
@@ -1150,6 +1177,8 @@ __zio_execute(zio_t *zio)
 	while (zio->io_stage < ZIO_STAGE_DONE) {
 		enum zio_stage pipeline = zio->io_pipeline;
 		enum zio_stage stage = zio->io_stage;
+		dsl_pool_t *dsl;
+		boolean_t cut;
 		int rv;
 
 		ASSERT(!MUTEX_HELD(&zio->io_lock));
@@ -1162,19 +1191,26 @@ __zio_execute(zio_t *zio)
 
 		ASSERT(stage <= ZIO_STAGE_DONE);
 
+		dsl = spa_get_dsl(zio->io_spa);
+		cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+		    zio_requeue_io_start_cut_in_line : B_FALSE;
+
 		/*
 		 * If we are in interrupt context and this pipeline stage
 		 * will grab a config lock that is held across I/O,
 		 * or may wait for an I/O that needs an interrupt thread
 		 * to complete, issue async to avoid deadlock.
 		 *
+		 * If we are in the txg_sync_thread or being called
+		 * during pool init issue async to minimize stack depth.
+		 * Both of these call paths may be recursively called.
+		 *
 		 * For VDEV_IO_START, we cut in line so that the io will
 		 * be sent to disk promptly.
 		 */
-		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
-		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
-			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
-			    zio_requeue_io_start_cut_in_line : B_FALSE;
+		if (((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
+		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) ||
+		    (dsl != NULL && dsl_pool_sync_context(dsl))) {
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
@@ -2142,6 +2178,7 @@ zio_dva_allocate(zio_t *zio)
 	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = zio->io_bp;
 	int error;
+	int flags = 0;
 
 	if (zio->io_gang_leader == NULL) {
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
@@ -2154,10 +2191,21 @@ zio_dva_allocate(zio_t *zio)
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
+	/*
+	 * The dump device does not support gang blocks so allocation on
+	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
+	 * the "fast" gang feature.
+	 */
+	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
+	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
+	    METASLAB_GANG_CHILD : 0;
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
-	    zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
+	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
 
 	if (error) {
+		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
+		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
+		    error);
 		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
 			return (zio_write_gang_block(zio));
 		zio->io_error = error;
@@ -2908,9 +2956,11 @@ zio_done(zio_t *zio)
 			 * Reexecution is potentially a huge amount of work.
 			 * Hand it off to the otherwise-unused claim taskq.
 			 */
-			(void) taskq_dispatch(
+			ASSERT(taskq_empty_ent(&zio->io_tqent));
+			(void) taskq_dispatch_ent(
 			    zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
-			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
+			    (task_func_t *)zio_reexecute, zio, 0,
+			    &zio->io_tqent);
 		}
 		return (ZIO_PIPELINE_STOP);
 	}
@@ -3006,5 +3056,8 @@ module_param(zio_bulk_flags, int, 0644);
 MODULE_PARM_DESC(zio_bulk_flags, "Additional flags to pass to bulk buffers");
 
 module_param(zio_delay_max, int, 0644);
-MODULE_PARM_DESC(zio_delay_max, "Max zio delay before posting an event (ms)");
+MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
+
+module_param(zio_requeue_io_start_cut_in_line, int, 0644);
+MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
 #endif