X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fvdev_queue.c;h=b2cc6b87ffd783ea927a563f308729c59ffc0ab0;hb=refs%2Fheads%2Frertzinger%2Ffeature-zpool-get--p;hp=46fca0e3b629f4730fed00840df3c001837f5514;hpb=172bb4bd5e4afef721dd4d2972d8680d983f144b;p=zfs.git

diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 46fca0e..b2cc6b8 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -19,12 +19,15 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -38,20 +41,27 @@
  * of i/os pending to each device (before it starts ramping up to
  * max_pending).
  */
-int zfs_vdev_max_pending = 35;
+int zfs_vdev_max_pending = 10;
 int zfs_vdev_min_pending = 4;
 
-/* deadline = pri + (lbolt >> time_shift) */
-int zfs_vdev_time_shift = 6;
+/*
+ * The deadlines are grouped into buckets based on zfs_vdev_time_shift:
+ * deadline = pri + (gethrtime() >> time_shift)
+ */
+int zfs_vdev_time_shift = 29; /* each bucket is 0.537 seconds */
 
 /* exponential I/O issue ramp-up rate */
 int zfs_vdev_ramp_rate = 2;
 
 /*
- * i/os will be aggregated into a single large i/o up to
- * zfs_vdev_aggregation_limit bytes long.
+ * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
+ * For read I/Os, we also aggregate across small adjacency gaps; for writes
+ * we include spans of optional I/Os to aid aggregation at the disk even when
+ * they aren't able to help us aggregate at this level.
  */
 int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+int zfs_vdev_read_gap_limit = 32 << 10;
+int zfs_vdev_write_gap_limit = 4 << 10;
 
 /*
  * Virtual device vector for disk I/O scheduling.
@@ -103,6 +113,7 @@ void
 vdev_queue_init(vdev_t *vd)
 {
     vdev_queue_t *vq = &vd->vdev_queue;
+    int i;
 
     mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
 
@@ -117,18 +128,36 @@ vdev_queue_init(vdev_t *vd)
 
     avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
         sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+    /*
+     * A list of buffers which can be used for aggregate I/O; this
+     * avoids the need to allocate them on demand when memory is low.
+     */
+    list_create(&vq->vq_io_list, sizeof (vdev_io_t),
+        offsetof(vdev_io_t, vi_node));
+
+    for (i = 0; i < zfs_vdev_max_pending; i++)
+        list_insert_tail(&vq->vq_io_list, zio_vdev_alloc());
 }
 
 void
 vdev_queue_fini(vdev_t *vd)
 {
     vdev_queue_t *vq = &vd->vdev_queue;
+    vdev_io_t *vi;
 
     avl_destroy(&vq->vq_deadline_tree);
     avl_destroy(&vq->vq_read_tree);
     avl_destroy(&vq->vq_write_tree);
     avl_destroy(&vq->vq_pending_tree);
 
+    while ((vi = list_head(&vq->vq_io_list)) != NULL) {
+        list_remove(&vq->vq_io_list, vi);
+        zio_vdev_free(vi);
+    }
+
+    list_destroy(&vq->vq_io_list);
+
     mutex_destroy(&vq->vq_lock);
 }
 
@@ -149,34 +178,41 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 static void
 vdev_queue_agg_io_done(zio_t *aio)
 {
-    zio_t *dio;
-    uint64_t offset = 0;
+    vdev_queue_t *vq = &aio->io_vd->vdev_queue;
+    vdev_io_t *vi = aio->io_data;
+    zio_t *pio;
 
-    while ((dio = aio->io_delegate_list) != NULL) {
+    while ((pio = zio_walk_parents(aio)) != NULL)
         if (aio->io_type == ZIO_TYPE_READ)
-            bcopy((char *)aio->io_data + offset, dio->io_data,
-                dio->io_size);
-        offset += dio->io_size;
-        aio->io_delegate_list = dio->io_delegate_next;
-        dio->io_delegate_next = NULL;
-        dio->io_error = aio->io_error;
-        zio_execute(dio);
-    }
-    ASSERT3U(offset, ==, aio->io_size);
+            bcopy((char *)aio->io_data + (pio->io_offset -
+                aio->io_offset), pio->io_data, pio->io_size);
 
-    zio_buf_free(aio->io_data, aio->io_size);
+    mutex_enter(&vq->vq_lock);
+    list_insert_tail(&vq->vq_io_list, vi);
+    mutex_exit(&vq->vq_lock);
 }
 
-#define IS_ADJACENT(io, nio) \
-    ((io)->io_offset + (io)->io_size == (nio)->io_offset)
+/*
+ * Compute the range spanned by two i/os, which is the endpoint of the last
+ * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
+ * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
+ * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
+ */
+#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
+#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
 
 static zio_t *
 vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
 {
-    zio_t *fio, *lio, *aio, *dio;
-    avl_tree_t *tree;
-    uint64_t size;
-
+    zio_t *fio, *lio, *aio, *dio, *nio, *mio;
+    avl_tree_t *t;
+    vdev_io_t *vi;
+    int flags;
+    uint64_t maxspan = MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE);
+    uint64_t maxgap;
+    int stretch;
+
+again:
     ASSERT(MUTEX_HELD(&vq->vq_lock));
 
     if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
@@ -185,58 +221,159 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
 
     fio = lio = avl_first(&vq->vq_deadline_tree);
 
-    tree = fio->io_vdev_tree;
-    size = fio->io_size;
+    t = fio->io_vdev_tree;
+    flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
+    maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;
 
-    while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
-        !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) &&
-        size + dio->io_size <= zfs_vdev_aggregation_limit) {
-        dio->io_delegate_next = fio;
-        fio = dio;
-        size += dio->io_size;
+    vi = list_head(&vq->vq_io_list);
+    if (vi == NULL) {
+        vi = zio_vdev_alloc();
+        list_insert_head(&vq->vq_io_list, vi);
     }
 
-    while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
-        !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) &&
-        size + dio->io_size <= zfs_vdev_aggregation_limit) {
-        lio->io_delegate_next = dio;
-        lio = dio;
-        size += dio->io_size;
+    if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
+        /*
+         * We can aggregate I/Os that are sufficiently adjacent and of
+         * the same flavor, as expressed by the AGG_INHERIT flags.
+         * The latter requirement is necessary so that certain
+         * attributes of the I/O, such as whether it's a normal I/O
+         * or a scrub/resilver, can be preserved in the aggregate.
+         * We can include optional I/Os, but don't allow them
+         * to begin a range as they add no benefit in that situation.
+         */
+
+        /*
+         * We keep track of the last non-optional I/O.
+         */
+        mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
+
+        /*
+         * Walk backwards through sufficiently contiguous I/Os
+         * recording the last non-optional I/O.
+         */
+        while ((dio = AVL_PREV(t, fio)) != NULL &&
+            (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+            IO_SPAN(dio, lio) <= maxspan &&
+            IO_GAP(dio, fio) <= maxgap) {
+            fio = dio;
+            if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
+                mio = fio;
+        }
+
+        /*
+         * Skip any initial optional I/Os.
+         */
+        while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
+            fio = AVL_NEXT(t, fio);
+            ASSERT(fio != NULL);
+        }
+
+        /*
+         * Walk forward through sufficiently contiguous I/Os.
+         */
+        while ((dio = AVL_NEXT(t, lio)) != NULL &&
+            (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+            IO_SPAN(fio, dio) <= maxspan &&
+            IO_GAP(lio, dio) <= maxgap) {
+            lio = dio;
+            if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
+                mio = lio;
+        }
+
+        /*
+         * Now that we've established the range of the I/O aggregation
+         * we must decide what to do with trailing optional I/Os.
+         * For reads, there's nothing to do. For writes, while we are
+         * unable to aggregate further, it's possible that a trailing
+         * optional I/O would allow the underlying device to aggregate
+         * with subsequent I/Os. We must therefore determine if the
+         * next non-optional I/O is close enough to make aggregation
+         * worthwhile.
+         */
+        stretch = B_FALSE;
+        if (t != &vq->vq_read_tree && mio != NULL) {
+            nio = lio;
+            while ((dio = AVL_NEXT(t, nio)) != NULL &&
+                IO_GAP(nio, dio) == 0 &&
+                IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
+                nio = dio;
+                if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
+                    stretch = B_TRUE;
+                    break;
+                }
+            }
+        }
+
+        if (stretch) {
+            /* This may be a no-op. */
+            VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
+            dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
+        } else {
+            while (lio != mio && lio != fio) {
+                ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
+                lio = AVL_PREV(t, lio);
+                ASSERT(lio != NULL);
+            }
+        }
     }
 
     if (fio != lio) {
-        char *buf = zio_buf_alloc(size);
-        uint64_t offset = 0;
-
-        ASSERT(size <= zfs_vdev_aggregation_limit);
+        uint64_t size = IO_SPAN(fio, lio);
+        ASSERT(size <= maxspan);
+        ASSERT(vi != NULL);
 
         aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
-            buf, size, fio->io_type, ZIO_PRIORITY_NOW,
-            ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+            vi, size, fio->io_type, ZIO_PRIORITY_AGG,
+            flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
             vdev_queue_agg_io_done, NULL);
+        aio->io_timestamp = fio->io_timestamp;
 
-        aio->io_delegate_list = fio;
-
-        for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
+        nio = fio;
+        do {
+            dio = nio;
+            nio = AVL_NEXT(t, dio);
             ASSERT(dio->io_type == aio->io_type);
-            ASSERT(dio->io_vdev_tree == tree);
-            if (dio->io_type == ZIO_TYPE_WRITE)
-                bcopy(dio->io_data, buf + offset, dio->io_size);
-            offset += dio->io_size;
+            ASSERT(dio->io_vdev_tree == t);
+
+            if (dio->io_flags & ZIO_FLAG_NODATA) {
+                ASSERT(dio->io_type == ZIO_TYPE_WRITE);
+                bzero((char *)aio->io_data + (dio->io_offset -
+                    aio->io_offset), dio->io_size);
+            } else if (dio->io_type == ZIO_TYPE_WRITE) {
+                bcopy(dio->io_data, (char *)aio->io_data +
+                    (dio->io_offset - aio->io_offset),
+                    dio->io_size);
+            }
+
+            zio_add_child(dio, aio);
             vdev_queue_io_remove(vq, dio);
             zio_vdev_io_bypass(dio);
-        }
-
-        ASSERT(offset == size);
+            zio_execute(dio);
+        } while (dio != lio);
 
         avl_add(&vq->vq_pending_tree, aio);
+        list_remove(&vq->vq_io_list, vi);
 
         return (aio);
     }
 
-    ASSERT(fio->io_vdev_tree == tree);
+    ASSERT(fio->io_vdev_tree == t);
     vdev_queue_io_remove(vq, fio);
 
+    /*
+     * If the I/O is or was optional and therefore has no data, we need to
+     * simply discard it. We need to drop the vdev queue's lock to avoid a
+     * deadlock that we could encounter since this I/O will complete
+     * immediately.
+     */
+    if (fio->io_flags & ZIO_FLAG_NODATA) {
+        mutex_exit(&vq->vq_lock);
+        zio_vdev_io_bypass(fio);
+        zio_execute(fio);
+        mutex_enter(&vq->vq_lock);
+        goto again;
+    }
+
     avl_add(&vq->vq_pending_tree, fio);
 
     return (fio);
@@ -262,7 +399,9 @@ vdev_queue_io(zio_t *zio)
 
     mutex_enter(&vq->vq_lock);
 
-    zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority;
+    zio->io_timestamp = gethrtime();
+    zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
+        zio->io_priority;
 
     vdev_queue_io_add(vq, zio);
 
@@ -285,12 +424,20 @@ void
 vdev_queue_io_done(zio_t *zio)
 {
     vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+    int i;
+
+    if (zio_injection_enabled)
+        delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
 
     mutex_enter(&vq->vq_lock);
 
     avl_remove(&vq->vq_pending_tree, zio);
 
-    for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
+    zio->io_delta = gethrtime() - zio->io_timestamp;
+    vq->vq_io_complete_ts = gethrtime();
+    vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
+
+    for (i = 0; i < zfs_vdev_ramp_rate; i++) {
         zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
         if (nio == NULL)
             break;
@@ -306,3 +453,26 @@ vdev_queue_io_done(zio_t *zio)
 
     mutex_exit(&vq->vq_lock);
 }
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_vdev_max_pending, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_max_pending, "Max pending per-vdev I/Os");
+
+module_param(zfs_vdev_min_pending, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_min_pending, "Min pending per-vdev I/Os");
+
+module_param(zfs_vdev_aggregation_limit, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size");
+
+module_param(zfs_vdev_time_shift, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_time_shift, "Deadline time shift for vdev I/O");
+
+module_param(zfs_vdev_ramp_rate, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_ramp_rate, "Exponential I/O issue ramp-up rate");
+
+module_param(zfs_vdev_read_gap_limit, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap");
+
+module_param(zfs_vdev_write_gap_limit, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_write_gap_limit, "Aggregate write I/O over gap");
+#endif
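Reviewer's note, not part of the patch: the arithmetic the new tunables rely on is easy to sanity-check in isolation. The sketch below is a minimal standalone C program, not ZFS code; toy_io_t and the literal offsets, sizes, priority, and timestamp are hypothetical stand-ins for zio_t fields, while the IO_SPAN()/IO_GAP() macro bodies, the time shift of 29, and the 32 KB read gap limit are taken from the diff above.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in for the zio_t fields the macros actually use. */
typedef struct toy_io {
    uint64_t io_offset;    /* starting byte offset on the vdev */
    uint64_t io_size;      /* length of the I/O in bytes */
} toy_io_t;

/* Same shape as in the patch: total span of two I/Os, and the gap between them. */
#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))

int
main(void)
{
    /*
     * Deadline bucketing: with zfs_vdev_time_shift = 29 each bucket is
     * 2^29 ns wide, i.e. the 0.537 seconds quoted in the patch comment.
     */
    uint64_t now = 5000000000ULL;   /* pretend gethrtime() value: 5 s */
    int pri = 4;                    /* pretend zio priority */
    uint64_t deadline = pri + (now >> 29);

    printf("bucket width %.3f s, deadline %llu\n",
        (double)(1ULL << 29) / 1e9, (unsigned long long)deadline);

    /*
     * Two 32 KB reads separated by a 16 KB hole: IO_GAP() is 16 KB, within
     * the 32 KB zfs_vdev_read_gap_limit, and IO_SPAN() is 80 KB, so the
     * pair is a candidate for aggregation into a single read (provided the
     * span also stays under zfs_vdev_aggregation_limit).
     */
    toy_io_t a = { 0, 32768 };
    toy_io_t b = { 49152, 32768 };

    printf("span %llu bytes, gap %llu bytes\n",
        (unsigned long long)IO_SPAN(&a, &b),
        (unsigned long long)IO_GAP(&a, &b));
    return (0);
}

Since the tunables are registered with module_param(..., 0644), on Linux builds they should appear as writable files under /sys/module/zfs/parameters/, which makes it straightforward to experiment with the gap limits and the deadline shift at run time.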