X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fvdev_queue.c;fp=module%2Fzfs%2Fvdev_queue.c;h=9867d09704d03de3a9b8d5cb8b651ae054f70f5f;hb=45d1cae3b8c949ecc391dd7a5b81963b34c71c29;hp=5e57a15132e30708aca6e41a8b502c10dce96e20;hpb=9babb37438b58e77bad04e820d5702e15b79e6a6;p=zfs.git diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 5e57a15..9867d09 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -24,7 +24,7 @@ */ #include -#include +#include #include #include #include @@ -48,11 +48,14 @@ int zfs_vdev_time_shift = 6; int zfs_vdev_ramp_rate = 2; /* - * To reduce IOPs, we aggregate small adjacent i/os into one large i/o. - * For read i/os, we also aggregate across small adjacency gaps. + * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. + * For read I/Os, we also aggregate across small adjacency gaps; for writes + * we include spans of optional I/Os to aid aggregation at the disk even when + * they aren't able to help us aggregate at this level. */ int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; int zfs_vdev_read_gap_limit = 32 << 10; +int zfs_vdev_write_gap_limit = 4 << 10; /* * Virtual device vector for disk I/O scheduling. @@ -172,12 +175,14 @@ vdev_queue_agg_io_done(zio_t *aio) static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { - zio_t *fio, *lio, *aio, *dio, *nio; + zio_t *fio, *lio, *aio, *dio, *nio, *mio; avl_tree_t *t; int flags; uint64_t maxspan = zfs_vdev_aggregation_limit; uint64_t maxgap; + int stretch; +again: ASSERT(MUTEX_HELD(&vq->vq_lock)); if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || @@ -192,21 +197,88 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { /* - * We can aggregate I/Os that are adjacent and of the - * same flavor, as expressed by the AGG_INHERIT flags. - * The latter is necessary so that certain attributes - * of the I/O, such as whether it's a normal I/O or a - * scrub/resilver, can be preserved in the aggregate. + * We can aggregate I/Os that are sufficiently adjacent and of + * the same flavor, as expressed by the AGG_INHERIT flags. + * The latter requirement is necessary so that certain + * attributes of the I/O, such as whether it's a normal I/O + * or a scrub/resilver, can be preserved in the aggregate. + * We can include optional I/Os, but don't allow them + * to begin a range as they add no benefit in that situation. + */ + + /* + * We keep track of the last non-optional I/O. + */ + mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio; + + /* + * Walk backwards through sufficiently contiguous I/Os + * recording the last non-option I/O. */ while ((dio = AVL_PREV(t, fio)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap) + IO_SPAN(dio, lio) <= maxspan && + IO_GAP(dio, fio) <= maxgap) { fio = dio; + if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL)) + mio = fio; + } + /* + * Skip any initial optional I/Os. + */ + while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) { + fio = AVL_NEXT(t, fio); + ASSERT(fio != NULL); + } + + /* + * Walk forward through sufficiently contiguous I/Os. + */ while ((dio = AVL_NEXT(t, lio)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap) + IO_SPAN(fio, dio) <= maxspan && + IO_GAP(lio, dio) <= maxgap) { lio = dio; + if (!(lio->io_flags & ZIO_FLAG_OPTIONAL)) + mio = lio; + } + + /* + * Now that we've established the range of the I/O aggregation + * we must decide what to do with trailing optional I/Os. + * For reads, there's nothing to do. While we are unable to + * aggregate further, it's possible that a trailing optional + * I/O would allow the underlying device to aggregate with + * subsequent I/Os. We must therefore determine if the next + * non-optional I/O is close enough to make aggregation + * worthwhile. + */ + stretch = B_FALSE; + if (t != &vq->vq_read_tree && mio != NULL) { + nio = lio; + while ((dio = AVL_NEXT(t, nio)) != NULL && + IO_GAP(nio, dio) == 0 && + IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) { + nio = dio; + if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { + stretch = B_TRUE; + break; + } + } + } + + if (stretch) { + /* This may be a no-op. */ + VERIFY((dio = AVL_NEXT(t, lio)) != NULL); + dio->io_flags &= ~ZIO_FLAG_OPTIONAL; + } else { + while (lio != mio && lio != fio) { + ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL); + lio = AVL_PREV(t, lio); + ASSERT(lio != NULL); + } + } } if (fio != lio) { @@ -225,10 +297,15 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) ASSERT(dio->io_type == aio->io_type); ASSERT(dio->io_vdev_tree == t); - if (dio->io_type == ZIO_TYPE_WRITE) + if (dio->io_flags & ZIO_FLAG_NODATA) { + ASSERT(dio->io_type == ZIO_TYPE_WRITE); + bzero((char *)aio->io_data + (dio->io_offset - + aio->io_offset), dio->io_size); + } else if (dio->io_type == ZIO_TYPE_WRITE) { bcopy(dio->io_data, (char *)aio->io_data + (dio->io_offset - aio->io_offset), dio->io_size); + } zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); @@ -244,6 +321,20 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) ASSERT(fio->io_vdev_tree == t); vdev_queue_io_remove(vq, fio); + /* + * If the I/O is or was optional and therefore has no data, we need to + * simply discard it. We need to drop the vdev queue's lock to avoid a + * deadlock that we could encounter since this I/O will complete + * immediately. + */ + if (fio->io_flags & ZIO_FLAG_NODATA) { + mutex_exit(&vq->vq_lock); + zio_vdev_io_bypass(fio); + zio_execute(fio); + mutex_enter(&vq->vq_lock); + goto again; + } + avl_add(&vq->vq_pending_tree, fio); return (fio);