*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
*/
*/
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
+kmem_cache_t *zio_vdev_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
int zio_bulk_flags = 0;
static inline void __zio_execute(zio_t *zio);
+static int
+zio_cons(void *arg, void *unused, int kmflag)
+{
+ zio_t *zio = arg;
+
+ bzero(zio, sizeof (zio_t));
+
+ mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
+
+ list_create(&zio->io_parent_list, sizeof (zio_link_t),
+ offsetof(zio_link_t, zl_parent_node));
+ list_create(&zio->io_child_list, sizeof (zio_link_t),
+ offsetof(zio_link_t, zl_child_node));
+
+ return (0);
+}
+
+static void
+zio_dest(void *arg, void *unused)
+{
+ zio_t *zio = arg;
+
+ mutex_destroy(&zio->io_lock);
+ cv_destroy(&zio->io_cv);
+ list_destroy(&zio->io_parent_list);
+ list_destroy(&zio->io_child_list);
+}
+
void
zio_init(void)
{
#ifdef _KERNEL
data_alloc_arena = zio_alloc_arena;
#endif
- zio_cache = kmem_cache_create("zio_cache",
- sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM);
+ zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
+ zio_cons, zio_dest, NULL, NULL, NULL, KMC_KMEM);
zio_link_cache = kmem_cache_create("zio_link_cache",
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM);
+ zio_vdev_cache = kmem_cache_create("zio_vdev_cache", sizeof(vdev_io_t),
+ PAGESIZE, NULL, NULL, NULL, NULL, NULL, KMC_VMEM);
/*
* For small buffers, we want a cache for each multiple of
zio_data_buf_cache[c] = NULL;
}
+ kmem_cache_destroy(zio_vdev_cache);
kmem_cache_destroy(zio_link_cache);
kmem_cache_destroy(zio_cache);
ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
- return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
+ return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE | KM_NODEBUG));
}
/*
ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
- return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
+ return (kmem_cache_alloc(zio_data_buf_cache[c],
+ KM_PUSHPAGE | KM_NODEBUG));
}
void
}
/*
+ * Dedicated I/O buffers to ensure that memory fragmentation never prevents
+ * or significantly delays the issuing of a zio. These buffers are used
+ * to aggregate I/O and could be used for raidz stripes.
+ */
+void *
+zio_vdev_alloc(void)
+{
+ return (kmem_cache_alloc(zio_vdev_cache, KM_PUSHPAGE));
+}
+
+void
+zio_vdev_free(void *buf)
+{
+ kmem_cache_free(zio_vdev_cache, buf);
+
+}
+
+/*
* ==========================================================================
* Push and pop I/O transform buffers
* ==========================================================================
ASSERT(vd || stage == ZIO_STAGE_OPEN);
zio = kmem_cache_alloc(zio_cache, KM_PUSHPAGE);
- bzero(zio, sizeof (zio_t));
-
- mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
-
- list_create(&zio->io_parent_list, sizeof (zio_link_t),
- offsetof(zio_link_t, zl_parent_node));
- list_create(&zio->io_child_list, sizeof (zio_link_t),
- offsetof(zio_link_t, zl_child_node));
if (vd != NULL)
zio->io_child_type = ZIO_CHILD_VDEV;
zio->io_child_type = ZIO_CHILD_LOGICAL;
if (bp != NULL) {
+ zio->io_logical = NULL;
zio->io_bp = (blkptr_t *)bp;
zio->io_bp_copy = *bp;
zio->io_bp_orig = *bp;
zio->io_logical = zio;
if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
pipeline |= ZIO_GANG_STAGES;
+ } else {
+ zio->io_logical = NULL;
+ zio->io_bp = NULL;
+ bzero(&zio->io_bp_copy, sizeof (blkptr_t));
+ bzero(&zio->io_bp_orig, sizeof (blkptr_t));
}
zio->io_spa = spa;
zio->io_txg = txg;
+ zio->io_ready = NULL;
zio->io_done = done;
zio->io_private = private;
+ zio->io_prev_space_delta = 0;
zio->io_type = type;
zio->io_priority = priority;
zio->io_vd = vd;
+ zio->io_vsd = NULL;
+ zio->io_vsd_ops = NULL;
zio->io_offset = offset;
+ zio->io_deadline = 0;
zio->io_orig_data = zio->io_data = data;
zio->io_orig_size = zio->io_size = size;
zio->io_orig_flags = zio->io_flags = flags;
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
+ bzero(&zio->io_prop, sizeof (zio_prop_t));
+ zio->io_cmd = 0;
+ zio->io_reexecute = 0;
+ zio->io_bp_override = NULL;
+ zio->io_walk_link = NULL;
+ zio->io_transform_stack = NULL;
+ zio->io_delay = 0;
+ zio->io_error = 0;
+ zio->io_child_count = 0;
+ zio->io_parent_count = 0;
+ zio->io_stall = NULL;
+ zio->io_gang_leader = NULL;
+ zio->io_gang_tree = NULL;
+ zio->io_executor = NULL;
+ zio->io_waiter = NULL;
+ zio->io_cksum_report = NULL;
+ zio->io_ena = 0;
+ bzero(zio->io_child_error, sizeof (int) * ZIO_CHILD_TYPES);
+ bzero(zio->io_children,
+ sizeof (uint64_t) * ZIO_CHILD_TYPES * ZIO_WAIT_TYPES);
+ bzero(&zio->io_bookmark, sizeof (zbookmark_t));
zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
static void
zio_destroy(zio_t *zio)
{
- list_destroy(&zio->io_parent_list);
- list_destroy(&zio->io_child_list);
- mutex_destroy(&zio->io_lock);
- cv_destroy(&zio->io_cv);
kmem_cache_free(zio_cache, zio);
}
ASSERT(txg > spa_syncing_txg(spa));
- if (use_slog)
+ /*
+ * ZIL blocks are always contiguous (i.e. not gang blocks) so we
+ * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
+ * when allocating them.
+ */
+ if (use_slog) {
error = metaslab_alloc(spa, spa_log_class(spa), size,
- new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
+ new_bp, 1, txg, old_bp,
+ METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
+ }
- if (error)
+ if (error) {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
- new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
+ new_bp, 1, txg, old_bp,
+ METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
+ }
if (error == 0) {
BP_SET_LSIZE(new_bp, size);
align = 1ULL << vd->vdev_top->vdev_ashift;
- if (P2PHASE(zio->io_size, align) != 0) {
+ /*
+ * On Linux, we don't care about read alignment. The backing block
+ * device driver will take care of that for us.
+ * The only exception is raidz, which needs a full block for parity.
+ */
+ if (P2PHASE(zio->io_size, align) != 0 &&
+ (zio->io_type != ZIO_TYPE_READ ||
+ vd->vdev_ops == &vdev_raidz_ops)) {
uint64_t asize = P2ROUNDUP(zio->io_size, align);
char *abuf = zio_buf_alloc(asize);
- ASSERT(vd == vd->vdev_top);
if (zio->io_type == ZIO_TYPE_WRITE) {
bcopy(zio->io_data, abuf, zio->io_size);
bzero(abuf + zio->io_size, asize - zio->io_size);
}
zio_push_transform(zio, abuf, asize, asize, zio_subblock);
+ ASSERT(P2PHASE(zio->io_size, align) == 0);
}
ASSERT(P2PHASE(zio->io_offset, align) == 0);
- ASSERT(P2PHASE(zio->io_size, align) == 0);
VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
/*