X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fvdev_queue.c;h=b2cc6b87ffd783ea927a563f308729c59ffc0ab0;hb=cb682a173a84813b2aeb5d18f58cff1a07531fb3;hp=5a0d3ee97029d7f4016cadadf0fa3c446bb5ccb8;hpb=428870ff734fdaccc342b33fc53cf94724409a46;p=zfs.git diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 5a0d3ee..b2cc6b8 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #include #include #include @@ -40,8 +44,11 @@ int zfs_vdev_max_pending = 10; int zfs_vdev_min_pending = 4; -/* deadline = pri + ddi_get_lbolt64() >> time_shift) */ -int zfs_vdev_time_shift = 6; +/* + * The deadlines are grouped into buckets based on zfs_vdev_time_shift: + * deadline = pri + gethrtime() >> time_shift) + */ +int zfs_vdev_time_shift = 29; /* each bucket is 0.537 seconds */ /* exponential I/O issue ramp-up rate */ int zfs_vdev_ramp_rate = 2; @@ -106,6 +113,7 @@ void vdev_queue_init(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; + int i; mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); @@ -120,18 +128,36 @@ vdev_queue_init(vdev_t *vd) avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + + /* + * A list of buffers which can be used for aggregate I/O, this + * avoids the need to allocate them on demand when memory is low. + */ + list_create(&vq->vq_io_list, sizeof (vdev_io_t), + offsetof(vdev_io_t, vi_node)); + + for (i = 0; i < zfs_vdev_max_pending; i++) + list_insert_tail(&vq->vq_io_list, zio_vdev_alloc()); } void vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; + vdev_io_t *vi; avl_destroy(&vq->vq_deadline_tree); avl_destroy(&vq->vq_read_tree); avl_destroy(&vq->vq_write_tree); avl_destroy(&vq->vq_pending_tree); + while ((vi = list_head(&vq->vq_io_list)) != NULL) { + list_remove(&vq->vq_io_list, vi); + zio_vdev_free(vi); + } + + list_destroy(&vq->vq_io_list); + mutex_destroy(&vq->vq_lock); } @@ -152,6 +178,8 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) static void vdev_queue_agg_io_done(zio_t *aio) { + vdev_queue_t *vq = &aio->io_vd->vdev_queue; + vdev_io_t *vi = aio->io_data; zio_t *pio; while ((pio = zio_walk_parents(aio)) != NULL) @@ -159,7 +187,9 @@ vdev_queue_agg_io_done(zio_t *aio) bcopy((char *)aio->io_data + (pio->io_offset - aio->io_offset), pio->io_data, pio->io_size); - zio_buf_free(aio->io_data, aio->io_size); + mutex_enter(&vq->vq_lock); + list_insert_tail(&vq->vq_io_list, vi); + mutex_exit(&vq->vq_lock); } /* @@ -176,8 +206,9 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { zio_t *fio, *lio, *aio, *dio, *nio, *mio; avl_tree_t *t; + vdev_io_t *vi; int flags; - uint64_t maxspan = zfs_vdev_aggregation_limit; + uint64_t maxspan = MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE); uint64_t maxgap; int stretch; @@ -194,6 +225,12 @@ again: flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0; + vi = list_head(&vq->vq_io_list); + if (vi == NULL) { + vi = zio_vdev_alloc(); + list_insert_head(&vq->vq_io_list, vi); + } + if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { /* * We can aggregate I/Os that are sufficiently adjacent and of @@ -282,12 +319,14 @@ again: if (fio != lio) { uint64_t size = IO_SPAN(fio, lio); - ASSERT(size <= zfs_vdev_aggregation_limit); + ASSERT(size <= maxspan); + ASSERT(vi != NULL); aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, - zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG, + vi, size, fio->io_type, ZIO_PRIORITY_AGG, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); + aio->io_timestamp = fio->io_timestamp; nio = fio; do { @@ -313,6 +352,7 @@ again: } while (dio != lio); avl_add(&vq->vq_pending_tree, aio); + list_remove(&vq->vq_io_list, vi); return (aio); } @@ -359,7 +399,8 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); - zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) + + zio->io_timestamp = gethrtime(); + zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + zio->io_priority; vdev_queue_io_add(vq, zio); @@ -383,12 +424,20 @@ void vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; + int i; + + if (zio_injection_enabled) + delay(SEC_TO_TICK(zio_handle_io_delay(zio))); mutex_enter(&vq->vq_lock); avl_remove(&vq->vq_pending_tree, zio); - for (int i = 0; i < zfs_vdev_ramp_rate; i++) { + zio->io_delta = gethrtime() - zio->io_timestamp; + vq->vq_io_complete_ts = gethrtime(); + vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp; + + for (i = 0; i < zfs_vdev_ramp_rate; i++) { zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); if (nio == NULL) break; @@ -404,3 +453,26 @@ vdev_queue_io_done(zio_t *zio) mutex_exit(&vq->vq_lock); } + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(zfs_vdev_max_pending, int, 0644); +MODULE_PARM_DESC(zfs_vdev_max_pending, "Max pending per-vdev I/Os"); + +module_param(zfs_vdev_min_pending, int, 0644); +MODULE_PARM_DESC(zfs_vdev_min_pending, "Min pending per-vdev I/Os"); + +module_param(zfs_vdev_aggregation_limit, int, 0644); +MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size"); + +module_param(zfs_vdev_time_shift, int, 0644); +MODULE_PARM_DESC(zfs_vdev_time_shift, "Deadline time shift for vdev I/O"); + +module_param(zfs_vdev_ramp_rate, int, 0644); +MODULE_PARM_DESC(zfs_vdev_ramp_rate, "Exponential I/O issue ramp-up rate"); + +module_param(zfs_vdev_read_gap_limit, int, 0644); +MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap"); + +module_param(zfs_vdev_write_gap_limit, int, 0644); +MODULE_PARM_DESC(zfs_vdev_write_gap_limit, "Aggregate write I/O over gap"); +#endif