X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fzio.c;h=bcfc081d1a01832ade32c7da8f5e61a738d897af;hb=9dcb97198338ba2d8764dd5604b278118612f74d;hp=c96442d0bd715af7b781ce6ca70ebdf80753a013;hpb=c8df41538db3723edf3536379a78fbaf68f037d0;p=zfs.git diff --git a/module/zfs/zio.c b/module/zfs/zio.c index c96442d..bcfc081 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. */ @@ -73,6 +73,7 @@ char *zio_type_name[ZIO_TYPES] = { */ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; +kmem_cache_t *zio_vdev_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; int zio_bulk_flags = 0; @@ -99,6 +100,35 @@ int zio_buf_debug_limit = 0; static inline void __zio_execute(zio_t *zio); +static int +zio_cons(void *arg, void *unused, int kmflag) +{ + zio_t *zio = arg; + + bzero(zio, sizeof (zio_t)); + + mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); + + list_create(&zio->io_parent_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_parent_node)); + list_create(&zio->io_child_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_child_node)); + + return (0); +} + +static void +zio_dest(void *arg, void *unused) +{ + zio_t *zio = arg; + + mutex_destroy(&zio->io_lock); + cv_destroy(&zio->io_cv); + list_destroy(&zio->io_parent_list); + list_destroy(&zio->io_child_list); +} + void zio_init(void) { @@ -108,10 +138,12 @@ zio_init(void) #ifdef _KERNEL data_alloc_arena = zio_alloc_arena; #endif - zio_cache = kmem_cache_create("zio_cache", - sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM); + zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, + zio_cons, zio_dest, NULL, NULL, NULL, KMC_KMEM); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM); + zio_vdev_cache = kmem_cache_create("zio_vdev_cache", sizeof(vdev_io_t), + PAGESIZE, NULL, NULL, NULL, NULL, NULL, KMC_VMEM); /* * For small buffers, we want a cache for each multiple of @@ -201,6 +233,7 @@ zio_fini(void) zio_data_buf_cache[c] = NULL; } + kmem_cache_destroy(zio_vdev_cache); kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); @@ -226,7 +259,7 @@ zio_buf_alloc(size_t size) ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); + return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE | KM_NODEBUG)); } /* @@ -242,7 +275,8 @@ zio_data_buf_alloc(size_t size) ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); + return (kmem_cache_alloc(zio_data_buf_cache[c], + KM_PUSHPAGE | KM_NODEBUG)); } void @@ -266,6 +300,24 @@ zio_data_buf_free(void *buf, size_t size) } /* + * Dedicated I/O buffers to ensure that memory fragmentation never prevents + * or significantly delays the issuing of a zio. These buffers are used + * to aggregate I/O and could be used for raidz stripes. + */ +void * +zio_vdev_alloc(void) +{ + return (kmem_cache_alloc(zio_vdev_cache, KM_PUSHPAGE)); +} + +void +zio_vdev_free(void *buf) +{ + kmem_cache_free(zio_vdev_cache, buf); + +} + +/* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== @@ -511,15 +563,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT(vd || stage == ZIO_STAGE_OPEN); zio = kmem_cache_alloc(zio_cache, KM_PUSHPAGE); - bzero(zio, sizeof (zio_t)); - - mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); - - list_create(&zio->io_parent_list, sizeof (zio_link_t), - offsetof(zio_link_t, zl_parent_node)); - list_create(&zio->io_child_list, sizeof (zio_link_t), - offsetof(zio_link_t, zl_child_node)); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; @@ -531,6 +574,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { + zio->io_logical = NULL; zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; @@ -541,21 +585,52 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; + } else { + zio->io_logical = NULL; + zio->io_bp = NULL; + bzero(&zio->io_bp_copy, sizeof (blkptr_t)); + bzero(&zio->io_bp_orig, sizeof (blkptr_t)); } zio->io_spa = spa; zio->io_txg = txg; + zio->io_ready = NULL; zio->io_done = done; zio->io_private = private; + zio->io_prev_space_delta = 0; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; + zio->io_vsd = NULL; + zio->io_vsd_ops = NULL; zio->io_offset = offset; + zio->io_deadline = 0; zio->io_orig_data = zio->io_data = data; zio->io_orig_size = zio->io_size = size; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; + bzero(&zio->io_prop, sizeof (zio_prop_t)); + zio->io_cmd = 0; + zio->io_reexecute = 0; + zio->io_bp_override = NULL; + zio->io_walk_link = NULL; + zio->io_transform_stack = NULL; + zio->io_delay = 0; + zio->io_error = 0; + zio->io_child_count = 0; + zio->io_parent_count = 0; + zio->io_stall = NULL; + zio->io_gang_leader = NULL; + zio->io_gang_tree = NULL; + zio->io_executor = NULL; + zio->io_waiter = NULL; + zio->io_cksum_report = NULL; + zio->io_ena = 0; + bzero(zio->io_child_error, sizeof (int) * ZIO_CHILD_TYPES); + bzero(zio->io_children, + sizeof (uint64_t) * ZIO_CHILD_TYPES * ZIO_WAIT_TYPES); + bzero(&zio->io_bookmark, sizeof (zbookmark_t)); zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); @@ -579,10 +654,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, static void zio_destroy(zio_t *zio) { - list_destroy(&zio->io_parent_list); - list_destroy(&zio->io_child_list); - mutex_destroy(&zio->io_lock); - cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } @@ -1234,18 +1305,34 @@ __zio_execute(zio_t *zio) int zio_wait(zio_t *zio) { + uint64_t timeout; int error; ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_executor == NULL); zio->io_waiter = curthread; + timeout = ddi_get_lbolt() + (zio_delay_max / MILLISEC * hz); __zio_execute(zio); mutex_enter(&zio->io_lock); - while (zio->io_executor != NULL) - cv_wait(&zio->io_cv, &zio->io_lock); + while (zio->io_executor != NULL) { + /* + * Wake up periodically to prevent the kernel from complaining + * about a blocked task. However, check zio_delay_max to see + * if the I/O has exceeded the timeout and post an ereport. + */ + cv_timedwait_interruptible(&zio->io_cv, &zio->io_lock, + ddi_get_lbolt() + hz); + + if (timeout && (ddi_get_lbolt() > timeout)) { + zio->io_delay = zio_delay_max; + zfs_ereport_post(FM_EREPORT_ZFS_DELAY, + zio->io_spa, zio->io_vd, zio, 0, 0); + timeout = 0; + } + } mutex_exit(&zio->io_lock); error = zio->io_error; @@ -1790,6 +1877,11 @@ zio_write_gang_block(zio_t *pio) */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + /* + * We didn't allocate this bp, so make sure it doesn't get unmarked. + */ + pio->io_flags &= ~ZIO_FLAG_FASTWRITE; + zio_nowait(zio); return (ZIO_PIPELINE_CONTINUE); @@ -2199,6 +2291,7 @@ zio_dva_allocate(zio_t *zio) flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? METASLAB_GANG_CHILD : 0; + flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0; error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags); @@ -2262,20 +2355,29 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int -zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, - uint64_t size, boolean_t use_slog) +zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size, + boolean_t use_slog) { int error = 1; ASSERT(txg > spa_syncing_txg(spa)); - if (use_slog) + /* + * ZIL blocks are always contiguous (i.e. not gang blocks) so we + * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" + * when allocating them. + */ + if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, - new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + new_bp, 1, txg, NULL, + METASLAB_FASTWRITE | METASLAB_GANG_AVOID); + } - if (error) + if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + new_bp, 1, txg, NULL, + METASLAB_FASTWRITE | METASLAB_GANG_AVOID); + } if (error == 0) { BP_SET_LSIZE(new_bp, size); @@ -2803,15 +2905,11 @@ zio_done(zio_t *zio) vdev_stat_update(zio, zio->io_size); /* - * If this I/O is attached to a particular vdev is slow, exeeding - * 30 seconds to complete, post an error described the I/O delay. - * We ignore these errors if the device is currently unavailable. + * When an I/O completes but was slow post an ereport. */ - if (zio->io_delay >= zio_delay_max) { - if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) - zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, - zio->io_vd, zio, 0, 0); - } + if (zio->io_delay >= zio_delay_max) + zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, + zio->io_vd, zio, 0, 0); if (zio->io_error) { /* @@ -2980,6 +3078,11 @@ zio_done(zio_t *zio) zfs_ereport_free_checksum(zcr); } + if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && + !BP_IS_HOLE(zio->io_bp)) { + metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); + } + /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as