*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
* ==========================================================================
*/
char *zio_type_name[ZIO_TYPES] = {
- "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
- "zio_ioctl"
+ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
};
/*
*/
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
+kmem_cache_t *zio_vdev_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+int zio_bulk_flags = 0;
+int zio_delay_max = ZIO_DELAY_MAX;
#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
+extern int zfs_mg_alloc_failures;
/*
* An allocating zio is one that either currently has the DVA allocate
*/
#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
-boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;
+int zio_requeue_io_start_cut_in_line = 1;
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif
+static inline void __zio_execute(zio_t *zio);
+
+static int
+zio_cons(void *arg, void *unused, int kmflag)
+{
+ zio_t *zio = arg;
+
+ bzero(zio, sizeof (zio_t));
+
+ mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
+
+ list_create(&zio->io_parent_list, sizeof (zio_link_t),
+ offsetof(zio_link_t, zl_parent_node));
+ list_create(&zio->io_child_list, sizeof (zio_link_t),
+ offsetof(zio_link_t, zl_child_node));
+
+ return (0);
+}
+
+static void
+zio_dest(void *arg, void *unused)
+{
+ zio_t *zio = arg;
+
+ mutex_destroy(&zio->io_lock);
+ cv_destroy(&zio->io_cv);
+ list_destroy(&zio->io_parent_list);
+ list_destroy(&zio->io_child_list);
+}
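+
+/*
+ * Note: the mutex, cv and lists built by zio_cons() belong to the cached
+ * object, not to any single I/O; they survive kmem_cache_free() and are
+ * reused by the next kmem_cache_alloc().  This is why zio_create() below
+ * resets the remaining fields individually rather than bzero()ing the
+ * whole structure, which would destroy the embedded lock state.
+ */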
+
void
zio_init(void)
{
#ifdef _KERNEL
data_alloc_arena = zio_alloc_arena;
#endif
- zio_cache = kmem_cache_create("zio_cache",
- sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
+ zio_cons, zio_dest, NULL, NULL, NULL, KMC_KMEM);
zio_link_cache = kmem_cache_create("zio_link_cache",
- sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM);
+ zio_vdev_cache = kmem_cache_create("zio_vdev_cache", sizeof (vdev_io_t),
+ PAGESIZE, NULL, NULL, NULL, NULL, NULL, KMC_VMEM);
/*
* For small buffers, we want a cache for each multiple of
if (align != 0) {
char name[36];
+ int flags = zio_bulk_flags;
+
+ /*
+ * The smallest buffers (512b) are heavily used and
+ * experience a lot of churn. The slabs allocated
+ * for them are also relatively small (32K). Thus
+ * in order to avoid expensive calls to vmalloc() we
+ * make an exception to the usual slab allocation
+ * policy and force these buffers to be kmem backed.
+ */
+ if (size == (1 << SPA_MINBLOCKSHIFT))
+ flags |= KMC_KMEM;
+
(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
zio_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, NULL,
- size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
+ align, NULL, NULL, NULL, NULL, NULL, flags);
(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
zio_data_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, data_alloc_arena,
- size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
+ align, NULL, NULL, NULL, NULL,
+ data_alloc_arena, flags);
}
}
zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
}
+ /*
+ * The zio write taskqs have 1 thread per cpu; allow half of those
+ * threads to fail allocation 3 times each per txg, or 8 failures in
+ * total, whichever is greater.
+ */
+ zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
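+ /*
+ * For example, max_ncpus = 4 gives MAX(6, 8) = 8, while
+ * max_ncpus = 16 gives MAX(24, 8) = 24.
+ */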
+
zio_inject_init();
}
zio_data_buf_cache[c] = NULL;
}
+ kmem_cache_destroy(zio_vdev_cache);
kmem_cache_destroy(zio_link_cache);
kmem_cache_destroy(zio_cache);
ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
- return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
+ return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE | KM_NODEBUG));
}
/*
ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
- return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
+ return (kmem_cache_alloc(zio_data_buf_cache[c],
+ KM_PUSHPAGE | KM_NODEBUG));
}
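+
+/*
+ * Note: both allocators above pass KM_NODEBUG alongside KM_PUSHPAGE,
+ * presumably to quiet the SPL's warnings about large allocations; this
+ * replaces the per-cache KMC_NODEBUG flag that zio_init() used to set
+ * for caches above zio_buf_debug_limit.
+ */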
void
}
/*
+ * Dedicated I/O buffers to ensure that memory fragmentation never prevents
+ * or significantly delays the issuing of a zio. These buffers are used
+ * to aggregate I/O and could be used for raidz stripes.
+ */
+void *
+zio_vdev_alloc(void)
+{
+ return (kmem_cache_alloc(zio_vdev_cache, KM_PUSHPAGE));
+}
+
+void
+zio_vdev_free(void *buf)
+{
+ kmem_cache_free(zio_vdev_cache, buf);
+}
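+
+/*
+ * Usage sketch for the pair above (hypothetical caller, e.g. an I/O
+ * aggregation path):
+ *
+ *	void *buf = zio_vdev_alloc();
+ *	... fill buf and issue the aggregated I/O ...
+ *	zio_vdev_free(buf);
+ */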
+
+/*
* ==========================================================================
* Push and pop I/O transform buffers
* ==========================================================================
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
zio_transform_func_t *transform)
{
- zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
+ zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_PUSHPAGE);
zt->zt_orig_data = zio->io_data;
zt->zt_orig_size = zio->io_size;
void
zio_add_child(zio_t *pio, zio_t *cio)
{
- zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
+ zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_PUSHPAGE);
+ int w;
/*
* Logical I/Os can have logical, gang, or vdev children.
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ for (w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
list_insert_head(&pio->io_child_list, zl);
return (waiting);
}
-static void
+__attribute__((always_inline))
+static inline void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
if (--*countp == 0 && pio->io_stall == countp) {
pio->io_stall = NULL;
mutex_exit(&pio->io_lock);
- zio_execute(pio);
+ __zio_execute(pio);
} else {
mutex_exit(&pio->io_lock);
}
ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
ASSERT(vd || stage == ZIO_STAGE_OPEN);
- zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
- bzero(zio, sizeof (zio_t));
-
- mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
-
- list_create(&zio->io_parent_list, sizeof (zio_link_t),
- offsetof(zio_link_t, zl_parent_node));
- list_create(&zio->io_child_list, sizeof (zio_link_t),
- offsetof(zio_link_t, zl_child_node));
+ zio = kmem_cache_alloc(zio_cache, KM_PUSHPAGE);
if (vd != NULL)
zio->io_child_type = ZIO_CHILD_VDEV;
zio->io_child_type = ZIO_CHILD_LOGICAL;
if (bp != NULL) {
+ zio->io_logical = NULL;
zio->io_bp = (blkptr_t *)bp;
zio->io_bp_copy = *bp;
zio->io_bp_orig = *bp;
zio->io_logical = zio;
if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
pipeline |= ZIO_GANG_STAGES;
+ } else {
+ zio->io_logical = NULL;
+ zio->io_bp = NULL;
+ bzero(&zio->io_bp_copy, sizeof (blkptr_t));
+ bzero(&zio->io_bp_orig, sizeof (blkptr_t));
}
zio->io_spa = spa;
zio->io_txg = txg;
+ zio->io_ready = NULL;
zio->io_done = done;
zio->io_private = private;
+ zio->io_prev_space_delta = 0;
zio->io_type = type;
zio->io_priority = priority;
zio->io_vd = vd;
+ zio->io_vsd = NULL;
+ zio->io_vsd_ops = NULL;
zio->io_offset = offset;
+ zio->io_deadline = 0;
zio->io_orig_data = zio->io_data = data;
zio->io_orig_size = zio->io_size = size;
zio->io_orig_flags = zio->io_flags = flags;
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
+ bzero(&zio->io_prop, sizeof (zio_prop_t));
+ zio->io_cmd = 0;
+ zio->io_reexecute = 0;
+ zio->io_bp_override = NULL;
+ zio->io_walk_link = NULL;
+ zio->io_transform_stack = NULL;
+ zio->io_delay = 0;
+ zio->io_error = 0;
+ zio->io_child_count = 0;
+ zio->io_parent_count = 0;
+ zio->io_stall = NULL;
+ zio->io_gang_leader = NULL;
+ zio->io_gang_tree = NULL;
+ zio->io_executor = NULL;
+ zio->io_waiter = NULL;
+ zio->io_cksum_report = NULL;
+ zio->io_ena = 0;
+ bzero(zio->io_child_error, sizeof (int) * ZIO_CHILD_TYPES);
+ bzero(zio->io_children,
+ sizeof (uint64_t) * ZIO_CHILD_TYPES * ZIO_WAIT_TYPES);
+ bzero(&zio->io_bookmark, sizeof (zbookmark_t));
zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
zio_add_child(pio, zio);
}
+ taskq_init_ent(&zio->io_tqent);
+
return (zio);
}
static void
zio_destroy(zio_t *zio)
{
- list_destroy(&zio->io_parent_list);
- list_destroy(&zio->io_child_list);
- mutex_destroy(&zio->io_lock);
- cv_destroy(&zio->io_cv);
kmem_cache_free(zio_cache, zio);
}
*/
if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
pass > SYNC_PASS_REWRITE) {
- ASSERT(psize != 0);
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
+ ASSERT(psize != 0);
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
} else {
{
spa_t *spa = zio->io_spa;
zio_type_t t = zio->io_type;
- int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);
+ int flags = (cutinline ? TQ_FRONT : 0);
/*
* If we're a config writer or a probe, the normal issue and
q++;
ASSERT3U(q, <, ZIO_TASKQ_TYPES);
- (void) taskq_dispatch(spa->spa_zio_taskq[t][q],
- (task_func_t *)zio_execute, zio, flags);
+
+ /*
+ * NB: We assume the zio may only be dispatched to a single
+ * taskq at a time; dispatching it to a second taskq
+ * concurrently would be a grievous error.
+ */
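+ /*
+ * taskq_dispatch_ent() consumes the taskq_ent_t embedded in the zio
+ * (io_tqent, prepared by taskq_init_ent() in zio_create()), so the
+ * dispatch requires no memory allocation and cannot fail.
+ */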
+ ASSERT(taskq_empty_ent(&zio->io_tqent));
+ taskq_dispatch_ent(spa->spa_zio_taskq[t][q],
+ (task_func_t *)zio_execute, zio, flags, &zio->io_tqent);
}
static boolean_t
{
kthread_t *executor = zio->io_executor;
spa_t *spa = zio->io_spa;
+ zio_type_t t;
- for (zio_type_t t = 0; t < ZIO_TYPES; t++)
+ for (t = 0; t < ZIO_TYPES; t++)
if (taskq_member(spa->spa_zio_taskq[t][q], executor))
return (B_TRUE);
*/
static zio_pipe_stage_t *zio_pipeline[];
+/*
+ * zio_execute() is a wrapper around the static function
+ * __zio_execute() so that we can force __zio_execute() to be
+ * inlined. This reduces stack overhead, which is important
+ * because __zio_execute() is called recursively in several zio
+ * code paths. zio_execute() itself cannot be inlined because
+ * it is externally visible.
+ */
void
zio_execute(zio_t *zio)
{
+ __zio_execute(zio);
+}
+
+__attribute__((always_inline))
+static inline void
+__zio_execute(zio_t *zio)
+{
zio->io_executor = curthread;
while (zio->io_stage < ZIO_STAGE_DONE) {
enum zio_stage pipeline = zio->io_pipeline;
enum zio_stage stage = zio->io_stage;
+ dsl_pool_t *dsl;
+ boolean_t cut;
int rv;
ASSERT(!MUTEX_HELD(&zio->io_lock));
ASSERT(stage <= ZIO_STAGE_DONE);
+ dsl = spa_get_dsl(zio->io_spa);
+ cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+ zio_requeue_io_start_cut_in_line : B_FALSE;
+
/*
* If we are in interrupt context and this pipeline stage
* will grab a config lock that is held across I/O,
* or may wait for an I/O that needs an interrupt thread
* to complete, issue async to avoid deadlock.
*
+ * If we are in the txg_sync_thread or being called
+ * during pool init, issue async to minimize stack depth.
+ * Both of these call paths may recurse.
+ *
* For VDEV_IO_START, we cut in line so that the io will
* be sent to disk promptly.
*/
- if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
- zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
- boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
- zio_requeue_io_start_cut_in_line : B_FALSE;
+ if (((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
+ zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) ||
+ (dsl != NULL && dsl_pool_sync_context(dsl))) {
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
return;
}
}
}
+
/*
* ==========================================================================
* Initiate I/O, either sync or async
zio->io_waiter = curthread;
- zio_execute(zio);
+ __zio_execute(zio);
mutex_enter(&zio->io_lock);
while (zio->io_executor != NULL)
zio_add_child(spa->spa_async_zio_root, zio);
}
- zio_execute(zio);
+ __zio_execute(zio);
}
/*
zio_reexecute(zio_t *pio)
{
zio_t *cio, *cio_next;
+ int c, w;
ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
pio->io_pipeline = pio->io_orig_pipeline;
pio->io_reexecute = 0;
pio->io_error = 0;
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ for (w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_state[w] = 0;
- for (int c = 0; c < ZIO_CHILD_TYPES; c++)
+ for (c = 0; c < ZIO_CHILD_TYPES; c++)
pio->io_child_error[c] = 0;
if (IO_IS_ALLOCATING(pio))
for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio);
mutex_enter(&pio->io_lock);
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ for (w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w]++;
mutex_exit(&pio->io_lock);
zio_reexecute(cio);
* responsibility of the caller to wait on him.
*/
if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
- zio_execute(pio);
+ __zio_execute(pio);
}
void
ASSERT(*gnpp == NULL);
- gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
+ gn = kmem_zalloc(sizeof (*gn), KM_PUSHPAGE);
gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
*gnpp = gn;
zio_gang_node_free(zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = *gnpp;
+ int g;
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
ASSERT(gn->gn_child[g] == NULL);
zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = *gnpp;
+ int g;
if (gn == NULL)
return;
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
zio_gang_tree_free(&gn->gn_child[g]);
zio_gang_node_free(gnpp);
zio_t *gio = zio->io_gang_leader;
zio_gang_node_t *gn = zio->io_private;
blkptr_t *bp = zio->io_bp;
+ int g;
ASSERT(gio == zio_unique_parent(zio));
ASSERT(zio->io_child_count == 0);
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (!BP_IS_GANG(gbp))
continue;
{
zio_t *gio = pio->io_gang_leader;
zio_t *zio;
+ int g;
ASSERT(BP_IS_GANG(bp) == !!gn);
ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
if (gn != NULL) {
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (BP_IS_HOLE(gbp))
continue;
zio_write_gang_member_ready(zio_t *zio)
{
zio_t *pio = zio_unique_parent(zio);
- zio_t *gio = zio->io_gang_leader;
+ ASSERTV(zio_t *gio = zio->io_gang_leader);
dva_t *cdva = zio->io_bp->blk_dva;
dva_t *pdva = pio->io_bp->blk_dva;
uint64_t asize;
+ int d;
if (BP_IS_HOLE(zio->io_bp))
return;
ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
mutex_enter(&pio->io_lock);
- for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
+ for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
ASSERT(DVA_GET_GANG(&pdva[d]));
asize = DVA_GET_ASIZE(&pdva[d]);
asize += DVA_GET_ASIZE(&cdva[d]);
int copies = gio->io_prop.zp_copies;
int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
zio_prop_t zp;
- int error;
+ int g, error;
error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
/*
* Create and nowait the gang children.
*/
- for (int g = 0; resid != 0; resid -= lsize, g++) {
+ for (g = 0; resid != 0; resid -= lsize, g++) {
lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
SPA_MINBLOCKSIZE);
ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
zio_ddt_read_start(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
+ int p;
ASSERT(BP_GET_DEDUP(bp));
ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
if (ddp_self == NULL)
return (ZIO_PIPELINE_CONTINUE);
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
continue;
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
spa_t *spa = zio->io_spa;
+ int p;
/*
* Note: we compare the original data, not the transformed data,
* pushed the I/O transforms. That's an important optimization
* because otherwise we'd compress/encrypt all dmu_sync() data twice.
*/
- for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
zio_t *lio = dde->dde_lead_zio[p];
if (lio != NULL) {
}
}
- for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
ddt_phys_t *ddp = &dde->dde_phys[p];
if (ddp->ddp_phys_birth != 0) {
zio_ddt_ditto_write_done(zio_t *zio)
{
int p = DDT_PHYS_DITTO;
- zio_prop_t *zp = &zio->io_prop;
blkptr_t *bp = zio->io_bp;
ddt_t *ddt = ddt_select(zio->io_spa, bp);
ddt_entry_t *dde = zio->io_private;
ddt_phys_t *ddp = &dde->dde_phys[p];
ddt_key_t *ddk = &dde->dde_key;
+ ASSERTV(zio_prop_t *zp = &zio->io_prop);
ddt_enter(ddt);
metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = zio->io_bp;
int error;
+ int flags = 0;
if (zio->io_gang_leader == NULL) {
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+ /*
+ * The dump device does not support gang blocks so allocation on
+ * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
+ * the "fast" gang feature.
+ */
+ flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
+ flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
+ METASLAB_GANG_CHILD : 0;
error = metaslab_alloc(spa, mc, zio->io_size, bp,
- zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
if (error) {
+ spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
+ "size %llu, error %d", spa_name(spa), zio, zio->io_size,
+ error);
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
return (zio_write_gang_block(zio));
zio->io_error = error;
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
+ int g;
+
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
ASSERT(zio->io_bp_override == NULL);
metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
if (gn != NULL) {
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
zio_dva_unallocate(zio, gn->gn_child[g],
&gn->gn_gbh->zg_blkptr[g]);
}
ASSERT(txg > spa_syncing_txg(spa));
- if (use_slog)
+ /*
+ * ZIL blocks are always contiguous (i.e. not gang blocks), so we
+ * set the METASLAB_GANG_AVOID flag to prevent them from "fast
+ * ganging" when allocated.
+ */
+ if (use_slog) {
error = metaslab_alloc(spa, spa_log_class(spa), size,
- new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
+ new_bp, 1, txg, old_bp,
+ METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
+ }
- if (error)
+ if (error) {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
- new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
+ new_bp, 1, txg, old_bp,
+ METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
+ }
if (error == 0) {
BP_SET_LSIZE(new_bp, size);
align = 1ULL << vd->vdev_top->vdev_ashift;
- if (P2PHASE(zio->io_size, align) != 0) {
+ /*
+ * On Linux, we don't care about read alignment. The backing block
+ * device driver will take care of that for us.
+ * The only exception is raidz, which needs a full block for parity.
+ */
+ if (P2PHASE(zio->io_size, align) != 0 &&
+ (zio->io_type != ZIO_TYPE_READ ||
+ vd->vdev_ops == &vdev_raidz_ops)) {
uint64_t asize = P2ROUNDUP(zio->io_size, align);
char *abuf = zio_buf_alloc(asize);
- ASSERT(vd == vd->vdev_top);
if (zio->io_type == ZIO_TYPE_WRITE) {
bcopy(zio->io_data, abuf, zio->io_size);
bzero(abuf + zio->io_size, asize - zio->io_size);
}
zio_push_transform(zio, abuf, asize, asize, zio_subblock);
+ ASSERT(P2PHASE(zio->io_size, align) == 0);
}
ASSERT(P2PHASE(zio->io_offset, align) == 0);
- ASSERT(P2PHASE(zio->io_size, align) == 0);
VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
/*
static int
zio_done(zio_t *zio)
{
- spa_t *spa = zio->io_spa;
- zio_t *lio = zio->io_logical;
- blkptr_t *bp = zio->io_bp;
- vdev_t *vd = zio->io_vd;
- uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
+ int c, w;
/*
* If our children haven't all completed,
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
return (ZIO_PIPELINE_STOP);
- for (int c = 0; c < ZIO_CHILD_TYPES; c++)
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ for (c = 0; c < ZIO_CHILD_TYPES; c++)
+ for (w = 0; w < ZIO_WAIT_TYPES; w++)
ASSERT(zio->io_children[c][w] == 0);
- if (bp != NULL) {
- ASSERT(bp->blk_pad[0] == 0);
- ASSERT(bp->blk_pad[1] == 0);
- ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
- (bp == zio_unique_parent(zio)->io_bp));
- if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+ if (zio->io_bp != NULL) {
+ ASSERT(zio->io_bp->blk_pad[0] == 0);
+ ASSERT(zio->io_bp->blk_pad[1] == 0);
+ ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
+ (zio->io_bp == zio_unique_parent(zio)->io_bp));
+ if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
zio->io_bp_override == NULL &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
- ASSERT(!BP_SHOULD_BYTESWAP(bp));
- ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
- ASSERT(BP_COUNT_GANG(bp) == 0 ||
- (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
+ ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
+ ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
+ ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
+ (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp)));
}
}
while (zio->io_cksum_report != NULL) {
zio_cksum_report_t *zcr = zio->io_cksum_report;
uint64_t align = zcr->zcr_align;
- uint64_t asize = P2ROUNDUP(psize, align);
+ uint64_t asize = P2ROUNDUP(zio->io_size, align);
char *abuf = zio->io_data;
- if (asize != psize) {
+ if (asize != zio->io_size) {
abuf = zio_buf_alloc(asize);
- bcopy(zio->io_data, abuf, psize);
- bzero(abuf + psize, asize - psize);
+ bcopy(zio->io_data, abuf, zio->io_size);
+ bzero(abuf + zio->io_size, asize - zio->io_size);
}
zio->io_cksum_report = zcr->zcr_next;
zcr->zcr_finish(zcr, abuf);
zfs_ereport_free_checksum(zcr);
- if (asize != psize)
+ if (asize != zio->io_size)
zio_buf_free(abuf, asize);
}
}
zio_pop_transforms(zio); /* note: may set zio->io_error */
- vdev_stat_update(zio, psize);
+ vdev_stat_update(zio, zio->io_size);
+
+ /*
+ * If this I/O is attached to a particular vdev and is slow, taking
+ * longer than zio_delay_max (30 seconds by default) to complete,
+ * post an error describing the I/O delay.
+ * We ignore these errors if the device is currently unavailable.
+ */
+ if (zio->io_delay >= zio_delay_max) {
+ if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
+ zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
+ zio->io_vd, zio, 0, 0);
+ }
if (zio->io_error) {
/*
* at the block level. We ignore these errors if the
* device is currently unavailable.
*/
- if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
- zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
+ if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
+ !vdev_is_dead(zio->io_vd))
+ zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
+ zio->io_vd, zio, 0, 0);
if ((zio->io_error == EIO || !(zio->io_flags &
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
- zio == lio) {
+ zio == zio->io_logical) {
/*
* For logical I/O requests, tell the SPA to log the
* error and generate a logical data ereport.
*/
- spa_log_error(spa, zio);
- zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
+ spa_log_error(zio->io_spa, zio);
+ zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, zio,
0, 0);
}
}
- if (zio->io_error && zio == lio) {
+ if (zio->io_error && zio == zio->io_logical) {
/*
* Determine whether zio should be reexecuted. This will
* propagate all the way to the root via zio_notify_parent().
*/
- ASSERT(vd == NULL && bp != NULL);
+ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
if (IO_IS_ALLOCATING(zio) &&
zio->io_type == ZIO_TYPE_FREE) &&
!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
zio->io_error == ENXIO &&
- spa_load_state(spa) == SPA_LOAD_NONE &&
- spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
+ spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
+ spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
if ((zio->io_error || zio->io_reexecute) &&
IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
!(zio->io_flags & ZIO_FLAG_IO_REWRITE))
- zio_dva_unallocate(zio, zio->io_gang_tree, bp);
+ zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
zio_gang_tree_free(&zio->io_gang_tree);
* We'd fail again if we reexecuted now, so suspend
* until conditions improve (e.g. device comes online).
*/
- zio_suspend(spa, zio);
+ zio_suspend(zio->io_spa, zio);
} else {
/*
* Reexecution is potentially a huge amount of work.
* Hand it off to the otherwise-unused claim taskq.
*/
- (void) taskq_dispatch(
- spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
- (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
+ ASSERT(taskq_empty_ent(&zio->io_tqent));
+ (void) taskq_dispatch_ent(
+ zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
+ (task_func_t *)zio_reexecute, zio, 0,
+ &zio->io_tqent);
}
return (ZIO_PIPELINE_STOP);
}
zio_checksum_verify,
zio_done
};
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+/* Fault injection */
+EXPORT_SYMBOL(zio_injection_enabled);
+EXPORT_SYMBOL(zio_inject_fault);
+EXPORT_SYMBOL(zio_inject_list_next);
+EXPORT_SYMBOL(zio_clear_fault);
+EXPORT_SYMBOL(zio_handle_fault_injection);
+EXPORT_SYMBOL(zio_handle_device_injection);
+EXPORT_SYMBOL(zio_handle_label_injection);
+EXPORT_SYMBOL(zio_priority_table);
+EXPORT_SYMBOL(zio_type_name);
+
+module_param(zio_bulk_flags, int, 0644);
+MODULE_PARM_DESC(zio_bulk_flags, "Additional flags to pass to bulk buffers");
+
+module_param(zio_delay_max, int, 0644);
+MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
+
+module_param(zio_requeue_io_start_cut_in_line, int, 0644);
+MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
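+
+/*
+ * These tunables may be set at module load time, for example (the
+ * values shown are illustrative only):
+ *
+ *	modprobe zfs zio_delay_max=60000 zio_requeue_io_start_cut_in_line=0
+ */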
+#endif