*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
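+/*
+ * Defined in metaslab.c; its default value is computed in zio_init() below.
+ */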
+extern int zfs_mg_alloc_failures;
/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
-boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;
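+/*
+ * Now an int rather than a boolean_t so it can be exported below via
+ * module_param().
+ */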
+int zio_requeue_io_start_cut_in_line = 1;
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
data_alloc_arena = zio_alloc_arena;
#endif
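+ /*
+ * KMC_KMEM backs these caches with kmem (kmalloc) slabs rather than
+ * vmem. The zio_t and zio_link_t structures are small and heavily
+ * allocated, so this mirrors the exception made for the 512b buffers
+ * below.
+ */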
zio_cache = kmem_cache_create("zio_cache",
- sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM);
zio_link_cache = kmem_cache_create("zio_link_cache",
- sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM);
/*
* For small buffers, we want a cache for each multiple of
if (align != 0) {
char name[36];
+ int flags = zio_bulk_flags;
+
+ /*
+ * The smallest buffers (512b) are heavily used and
+ * experience a lot of churn. The slabs allocated
+ * for them are also relatively small (32K). Thus
+ * in order to avoid expensive calls to vmalloc() we
+ * make an exception to the usual slab allocation
+ * policy and force these buffers to be kmem backed.
+ */
+ if (size == (1 << SPA_MINBLOCKSHIFT))
+ flags |= KMC_KMEM;
+
(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
zio_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, NULL,
- (size > zio_buf_debug_limit ? KMC_NODEBUG : 0) |
- zio_bulk_flags);
+ align, NULL, NULL, NULL, NULL, NULL, flags);
(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
zio_data_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, data_alloc_arena,
- (size > zio_buf_debug_limit ? KMC_NODEBUG : 0) |
- zio_bulk_flags);
+ align, NULL, NULL, NULL, NULL,
+ data_alloc_arena, flags);
}
}
zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
}
+ /*
+ * The zio write taskqs have 1 thread per cpu; allow half of those
+ * threads to each fail 3 times per txg (3 * max_ncpus / 2), or
+ * 8 failures, whichever is greater.
+ */
+ zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
+
zio_inject_init();
}
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
zio_transform_func_t *transform)
{
- zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
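+ /*
+ * KM_PUSHPAGE rather than KM_SLEEP is used here (and for the other
+ * allocations converted in this patch) because these paths can be
+ * reached during writeback; KM_PUSHPAGE keeps direct reclaim from
+ * re-entering the filesystem and deadlocking.
+ */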
+ zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_PUSHPAGE);
zt->zt_orig_data = zio->io_data;
zt->zt_orig_size = zio->io_size;
void
zio_add_child(zio_t *pio, zio_t *cio)
{
- zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
+ zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_PUSHPAGE);
int w;
/*
ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
ASSERT(vd || stage == ZIO_STAGE_OPEN);
- zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
+ zio = kmem_cache_alloc(zio_cache, KM_PUSHPAGE);
bzero(zio, sizeof (zio_t));
mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
zio_add_child(pio, zio);
}
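+ /*
+ * Pre-initialize the taskq entry embedded in the zio so that
+ * zio_taskq_dispatch() can use taskq_dispatch_ent(), which consumes
+ * this preallocated entry and therefore cannot fail.
+ */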
+ taskq_init_ent(&zio->io_tqent);
+
return (zio);
}
{
spa_t *spa = zio->io_spa;
zio_type_t t = zio->io_type;
- int flags = TQ_NOSLEEP | (cutinline ? TQ_FRONT : 0);
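+ /*
+ * TQ_NOSLEEP is no longer needed: taskq_dispatch_ent() uses the
+ * preallocated zio->io_tqent and cannot fail, so the old
+ * dispatch-retry loop is gone as well.
+ */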
+ int flags = (cutinline ? TQ_FRONT : 0);
/*
* If we're a config writer or a probe, the normal issue and
ASSERT3U(q, <, ZIO_TASKQ_TYPES);
- while (taskq_dispatch(spa->spa_zio_taskq[t][q],
- (task_func_t *)zio_execute, zio, flags) == 0); /* do nothing */
+ /*
+ * NB: We are assuming that the zio can only be dispatched
+ * to a single taskq at a time. It would be a grievous error
+ * to dispatch the zio to another taskq at the same time.
+ */
+ ASSERT(taskq_empty_ent(&zio->io_tqent));
+ taskq_dispatch_ent(spa->spa_zio_taskq[t][q],
+ (task_func_t *)zio_execute, zio, flags, &zio->io_tqent);
}
static boolean_t
while (zio->io_stage < ZIO_STAGE_DONE) {
enum zio_stage pipeline = zio->io_pipeline;
enum zio_stage stage = zio->io_stage;
+ dsl_pool_t *dsl;
+ boolean_t cut;
int rv;
ASSERT(!MUTEX_HELD(&zio->io_lock));
ASSERT(stage <= ZIO_STAGE_DONE);
+ dsl = spa_get_dsl(zio->io_spa);
+ cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+ zio_requeue_io_start_cut_in_line : B_FALSE;
+
/*
* If we are in interrupt context and this pipeline stage
* will grab a config lock that is held across I/O,
* or may wait for an I/O that needs an interrupt thread
* to complete, issue async to avoid deadlock.
*
+ * If we are in the txg_sync_thread or being called
+ * during pool init, issue async to minimize stack depth.
+ * Both of these call paths may be recursively called.
+ *
* For VDEV_IO_START, we cut in line so that the io will
* be sent to disk promptly.
*/
- if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
- zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
- boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
- zio_requeue_io_start_cut_in_line : B_FALSE;
+ if (((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
+ zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) ||
+ (dsl != NULL && dsl_pool_sync_context(dsl))) {
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
return;
}
ASSERT(*gnpp == NULL);
- gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
+ gn = kmem_zalloc(sizeof (*gn), KM_PUSHPAGE);
gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
*gnpp = gn;
metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = zio->io_bp;
int error;
+ int flags = 0;
if (zio->io_gang_leader == NULL) {
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+ /*
+ * The dump device does not support gang blocks so allocation on
+ * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
+ * the "fast" gang feature.
+ */
+ flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
+ flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
+ METASLAB_GANG_CHILD : 0;
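+ /*
+ * METASLAB_GANG_CHILD tags allocations issued on behalf of a gang
+ * child so metaslab_alloc() can account for them separately (see the
+ * flag's handling in metaslab.c).
+ */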
error = metaslab_alloc(spa, mc, zio->io_size, bp,
- zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
if (error) {
+ spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
+ "size %llu, error %d", spa_name(spa), zio, zio->io_size,
+ error);
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
return (zio_write_gang_block(zio));
zio->io_error = error;
* Reexecution is potentially a huge amount of work.
* Hand it off to the otherwise-unused claim taskq.
*/
- (void) taskq_dispatch(
+ ASSERT(taskq_empty_ent(&zio->io_tqent));
+ (void) taskq_dispatch_ent(
zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
- (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
+ (task_func_t *)zio_reexecute, zio, 0,
+ &zio->io_tqent);
}
return (ZIO_PIPELINE_STOP);
}
MODULE_PARM_DESC(zio_bulk_flags, "Additional flags to pass to bulk buffers");
module_param(zio_delay_max, int, 0644);
-MODULE_PARM_DESC(zio_delay_max, "Max zio delay before posting an event (ms)");
+MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
+
+module_param(zio_requeue_io_start_cut_in_line, int, 0644);
+MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
#endif