X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fzio.c;h=6b03be6f3bbf62e8d7c442e145e8bd5fd8ca90cb;hb=591fb62f19ee2431983a4cbeb0d200b1b8e7daf5;hp=0bf0f6e127c9b0210599f5b2f6a20f9a027d1751;hpb=d6320ddb78fa89c4d0fc2af00ae53c7c70992f96;p=zfs.git

diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 0bf0f6e..6b03be6 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -61,8 +62,7 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
  * ==========================================================================
  */
 char *zio_type_name[ZIO_TYPES] = {
-	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
-	"zio_ioctl"
+	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
 };
 
 /*
@@ -74,10 +74,13 @@ kmem_cache_t *zio_cache;
 kmem_cache_t *zio_link_cache;
 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+int zio_bulk_flags = 0;
+int zio_delay_max = ZIO_DELAY_MAX;
 
 #ifdef _KERNEL
 extern vmem_t *zio_alloc_arena;
 #endif
+extern int zfs_mg_alloc_failures;
 
 /*
  * An allocating zio is one that either currently has the DVA allocate
@@ -85,7 +88,7 @@ extern vmem_t *zio_alloc_arena;
  */
 #define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
 
-boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;
+int zio_requeue_io_start_cut_in_line = 1;
 
 #ifdef ZFS_DEBUG
 int zio_buf_debug_limit = 16384;
@@ -93,6 +96,8 @@ int zio_buf_debug_limit = 16384;
 int zio_buf_debug_limit = 0;
 #endif
 
+static inline void __zio_execute(zio_t *zio);
+
 void
 zio_init(void)
 {
@@ -103,9 +108,9 @@ zio_init(void)
 	data_alloc_arena = zio_alloc_arena;
 #endif
 	zio_cache = kmem_cache_create("zio_cache",
-	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM);
 	zio_link_cache = kmem_cache_create("zio_link_cache",
-	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM);
 
 	/*
	 * For small buffers, we want a cache for each multiple of
@@ -131,15 +136,27 @@ zio_init(void)
 
 		if (align != 0) {
 			char name[36];
+			int flags = zio_bulk_flags;
+
+			/*
+			 * The smallest buffers (512b) are heavily used and
+			 * experience a lot of churn. The slabs allocated
+			 * for them are also relatively small (32K). Thus
+			 * in order to avoid expensive calls to vmalloc() we
+			 * make an exception to the usual slab allocation
+			 * policy and force these buffers to be kmem backed.
+			 */
+			if (size == (1 << SPA_MINBLOCKSHIFT))
+				flags |= KMC_KMEM;
+
 			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 			zio_buf_cache[c] = kmem_cache_create(name, size,
-			    align, NULL, NULL, NULL, NULL, NULL,
-			    size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
+			    align, NULL, NULL, NULL, NULL, NULL, flags);
 
 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
-			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
-			    size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
+			    align, NULL, NULL, NULL, NULL,
+			    data_alloc_arena, flags);
 		}
 	}
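The hunk above carries the reasoning for forcing the smallest (512-byte) buffer caches to be kmem backed. Below is a minimal, user-space C sketch of that flag-selection policy, offered as an illustration only: the SPA_* constants mirror the ZFS definitions, the KMC_KMEM value is a placeholder, and only power-of-two size classes are walked for brevity (zio_init() itself covers every eligible multiple of 512).

#include <stdio.h>

#define	SPA_MINBLOCKSHIFT	9			/* 512-byte minimum block */
#define	SPA_MINBLOCKSIZE	(1UL << SPA_MINBLOCKSHIFT)
#define	SPA_MAXBLOCKSIZE	(128UL * 1024)		/* 128K maximum block */
#define	KMC_KMEM		0x1			/* placeholder flag value */

static int zio_bulk_flags = 0;				/* module tunable */

static int
cache_flags(size_t size)
{
	int flags = zio_bulk_flags;

	/* Smallest buffers churn heavily; keep them kmem backed. */
	if (size == (1 << SPA_MINBLOCKSHIFT))
		flags |= KMC_KMEM;

	return (flags);
}

int
main(void)
{
	size_t size;

	for (size = SPA_MINBLOCKSIZE; size <= SPA_MAXBLOCKSIZE; size *= 2)
		printf("zio_buf_%-6zu flags=0x%x\n", size,
		    (unsigned)cache_flags(size));

	return (0);
}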
@@ -153,6 +170,12 @@ zio_init(void)
 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}
 
+	/*
+	 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
+	 * to fail 3 times per txg or 8 failures, whichever is greater.
+	 */
+	zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
+
 	zio_inject_init();
 }
 
@@ -250,7 +273,7 @@ static void
 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
 	zio_transform_func_t *transform)
 {
-	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
+	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_PUSHPAGE);
 
 	zt->zt_orig_data = zio->io_data;
 	zt->zt_orig_size = zio->io_size;
@@ -365,7 +388,7 @@ zio_unique_parent(zio_t *cio)
 void
 zio_add_child(zio_t *pio, zio_t *cio)
 {
-	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
+	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_PUSHPAGE);
 	int w;
 
 	/*
@@ -436,7 +459,8 @@ zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 	return (waiting);
 }
 
-static void
+__attribute__((always_inline))
+static inline void
 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 {
 	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
@@ -450,7 +474,7 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 	if (--*countp == 0 && pio->io_stall == countp) {
 		pio->io_stall = NULL;
 		mutex_exit(&pio->io_lock);
-		zio_execute(pio);
+		__zio_execute(pio);
 	} else {
 		mutex_exit(&pio->io_lock);
 	}
@@ -485,7 +509,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 	ASSERT(vd || stage == ZIO_STAGE_OPEN);
 
-	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
+	zio = kmem_cache_alloc(zio_cache, KM_PUSHPAGE);
 	bzero(zio, sizeof (zio_t));
 
 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -1049,7 +1073,7 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
 {
 	spa_t *spa = zio->io_spa;
 	zio_type_t t = zio->io_type;
-	int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);
+	int flags = TQ_NOSLEEP | (cutinline ? TQ_FRONT : 0);
 
 	/*
	 * If we're a config writer or a probe, the normal issue and
@@ -1073,8 +1097,9 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
 		q++;
 
 	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
-	(void) taskq_dispatch(spa->spa_zio_taskq[t][q],
-	    (task_func_t *)zio_execute, zio, flags);
+
+	while (taskq_dispatch(spa->spa_zio_taskq[t][q],
+	    (task_func_t *)zio_execute, zio, flags) == 0); /* do nothing */
 }
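With TQ_SLEEP replaced by TQ_NOSLEEP above, taskq_dispatch() can return 0 under memory pressure instead of blocking, so the patch retries in a loop until the dispatch succeeds. Below is a self-contained sketch of that retry idiom; try_dispatch() is a hypothetical stand-in for taskq_dispatch(), returning a nonzero id on success and 0 on transient failure.

#include <stdio.h>

typedef void (task_func_t)(void *);

/* Stand-in for taskq_dispatch(): fails the first few attempts. */
static unsigned long
try_dispatch(task_func_t *func, void *arg)
{
	static int attempts = 0;

	if (++attempts < 3)
		return (0);		/* simulate transient failure */

	func(arg);			/* "queued": run it directly here */
	return ((unsigned long)attempts);
}

static void
work(void *arg)
{
	printf("executed %s\n", (char *)arg);
}

int
main(void)
{
	/* Retry until the dispatch succeeds, as the patch does. */
	while (try_dispatch(work, "zio") == 0)
		;	/* do nothing */

	return (0);
}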
 
 static boolean_t
@@ -1120,14 +1145,31 @@ zio_interrupt(zio_t *zio)
  */
 static zio_pipe_stage_t *zio_pipeline[];
 
+/*
+ * zio_execute() is a wrapper around the static function
+ * __zio_execute() so that we can force __zio_execute() to be
+ * inlined. This reduces stack overhead which is important
+ * because __zio_execute() is called recursively in several zio
+ * code paths. zio_execute() itself cannot be inlined because
+ * it is externally visible.
+ */
 void
 zio_execute(zio_t *zio)
 {
+	__zio_execute(zio);
+}
+
+__attribute__((always_inline))
+static inline void
+__zio_execute(zio_t *zio)
+{
 	zio->io_executor = curthread;
 
 	while (zio->io_stage < ZIO_STAGE_DONE) {
 		enum zio_stage pipeline = zio->io_pipeline;
 		enum zio_stage stage = zio->io_stage;
+		dsl_pool_t *dsl;
+		boolean_t cut;
 		int rv;
 
 		ASSERT(!MUTEX_HELD(&zio->io_lock));
@@ -1140,19 +1182,26 @@ zio_execute(zio_t *zio)
 
 		ASSERT(stage <= ZIO_STAGE_DONE);
 
+		dsl = spa_get_dsl(zio->io_spa);
+		cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+		    zio_requeue_io_start_cut_in_line : B_FALSE;
+
 		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
 		 *
+		 * If we are in the txg_sync_thread or being called
+		 * during pool init, issue async to minimize stack depth.
+		 * Both of these call paths may be recursively called.
+		 *
 		 * For VDEV_IO_START, we cut in line so that the io will
 		 * be sent to disk promptly.
 		 */
-		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
-		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
-			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
-			    zio_requeue_io_start_cut_in_line : B_FALSE;
+		if (((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
+		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) ||
+		    (dsl != NULL && dsl_pool_sync_context(dsl))) {
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
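The comments above explain the two stack-depth measures in this patch: __zio_execute() is force-inlined behind the exported zio_execute() shim, and deep or recursive callers (interrupt context, txg_sync, pool init) are bounced to a taskq. Here is a compile-alone sketch of the shim-plus-always_inline pattern; the pipeline_execute()/__pipeline_execute() names are illustrative, not ZFS symbols.

#include <stdio.h>

void pipeline_execute(int depth);	/* public, externally visible */

/*
 * The worker is forced inline, so internal calls through it cost no
 * extra stack frame beyond the caller's own.
 */
__attribute__((always_inline))
static inline void
__pipeline_execute(int depth)
{
	printf("stage at depth %d\n", depth);
	if (depth < 3)
		pipeline_execute(depth + 1);	/* indirect recursion */
}

/* Exported entry point: cannot itself be inlined across objects. */
void
pipeline_execute(int depth)
{
	__pipeline_execute(depth);
}

int
main(void)
{
	pipeline_execute(0);
	return (0);
}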
@@ -1167,6 +1216,7 @@ zio_execute(zio_t *zio)
 	}
 }
 
+
 /*
  * ==========================================================================
  * Initiate I/O, either sync or async
@@ -1182,7 +1232,7 @@ zio_wait(zio_t *zio)
 
 	zio->io_waiter = curthread;
 
-	zio_execute(zio);
+	__zio_execute(zio);
 
 	mutex_enter(&zio->io_lock);
 	while (zio->io_executor != NULL)
@@ -1212,7 +1262,7 @@ zio_nowait(zio_t *zio)
 		zio_add_child(spa->spa_async_zio_root, zio);
 	}
 
-	zio_execute(zio);
+	__zio_execute(zio);
 }
 
 /*
@@ -1267,7 +1317,7 @@ zio_reexecute(zio_t *pio)
 	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
-		zio_execute(pio);
+		__zio_execute(pio);
 }
 
 void
@@ -1480,7 +1530,7 @@ zio_gang_node_alloc(zio_gang_node_t **gnpp)
 
 	ASSERT(*gnpp == NULL);
 
-	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
+	gn = kmem_zalloc(sizeof (*gn), KM_PUSHPAGE);
 	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
 	*gnpp = gn;
 
@@ -1634,7 +1684,7 @@ static void
 zio_write_gang_member_ready(zio_t *zio)
 {
 	zio_t *pio = zio_unique_parent(zio);
-	zio_t *gio = zio->io_gang_leader;
+	ASSERTV(zio_t *gio = zio->io_gang_leader;)
 	dva_t *cdva = zio->io_bp->blk_dva;
 	dva_t *pdva = pio->io_bp->blk_dva;
 	uint64_t asize;
@@ -1949,12 +1999,12 @@ static void
 zio_ddt_ditto_write_done(zio_t *zio)
 {
 	int p = DDT_PHYS_DITTO;
-	zio_prop_t *zp = &zio->io_prop;
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt = ddt_select(zio->io_spa, bp);
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp = &dde->dde_phys[p];
 	ddt_key_t *ddk = &dde->dde_key;
+	ASSERTV(zio_prop_t *zp = &zio->io_prop);
 
 	ddt_enter(ddt);
 
@@ -2119,6 +2169,7 @@ zio_dva_allocate(zio_t *zio)
 	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = zio->io_bp;
 	int error;
+	int flags = 0;
 
 	if (zio->io_gang_leader == NULL) {
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
@@ -2131,10 +2182,21 @@ zio_dva_allocate(zio_t *zio)
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
+	/*
	 * The dump device does not support gang blocks so allocation on
	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
	 * the "fast" gang feature.
	 */
+	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
+	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
+	    METASLAB_GANG_CHILD : 0;
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
-	    zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
+	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
 
 	if (error) {
+		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
+		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
+		    error);
 		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
 			return (zio_write_gang_block(zio));
 		zio->io_error = error;
@@ -2661,11 +2723,6 @@ zio_ready(zio_t *zio)
 static int
 zio_done(zio_t *zio)
 {
-	spa_t *spa = zio->io_spa;
-	zio_t *lio = zio->io_logical;
-	blkptr_t *bp = zio->io_bp;
-	vdev_t *vd = zio->io_vd;
-	uint64_t psize = zio->io_size;
 	zio_t *pio, *pio_next;
 	int c, w;
 
@@ -2683,18 +2740,18 @@ zio_done(zio_t *zio)
 		for (w = 0; w < ZIO_WAIT_TYPES; w++)
 			ASSERT(zio->io_children[c][w] == 0);
 
-	if (bp != NULL) {
-		ASSERT(bp->blk_pad[0] == 0);
-		ASSERT(bp->blk_pad[1] == 0);
-		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
-		    (bp == zio_unique_parent(zio)->io_bp));
-		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+	if (zio->io_bp != NULL) {
+		ASSERT(zio->io_bp->blk_pad[0] == 0);
+		ASSERT(zio->io_bp->blk_pad[1] == 0);
+		ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
+		    (zio->io_bp == zio_unique_parent(zio)->io_bp));
+		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
 		    zio->io_bp_override == NULL &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
-			ASSERT(!BP_SHOULD_BYTESWAP(bp));
-			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
-			ASSERT(BP_COUNT_GANG(bp) == 0 ||
-			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
+			ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
+			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
+			ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
+			    (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp)));
 		}
 	}
 
@@ -2713,13 +2770,13 @@ zio_done(zio_t *zio)
 		while (zio->io_cksum_report != NULL) {
 			zio_cksum_report_t *zcr = zio->io_cksum_report;
 			uint64_t align = zcr->zcr_align;
-			uint64_t asize = P2ROUNDUP(psize, align);
+			uint64_t asize = P2ROUNDUP(zio->io_size, align);
 			char *abuf = zio->io_data;
 
-			if (asize != psize) {
+			if (asize != zio->io_size) {
 				abuf = zio_buf_alloc(asize);
-				bcopy(zio->io_data, abuf, psize);
-				bzero(abuf + psize, asize - psize);
+				bcopy(zio->io_data, abuf, zio->io_size);
+				bzero(abuf + zio->io_size, asize - zio->io_size);
 			}
 
 			zio->io_cksum_report = zcr->zcr_next;
@@ -2727,14 +2784,25 @@ zio_done(zio_t *zio)
 			zcr->zcr_finish(zcr, abuf);
 			zfs_ereport_free_checksum(zcr);
 
-			if (asize != psize)
+			if (asize != zio->io_size)
 				zio_buf_free(abuf, asize);
 		}
 	}
 
 	zio_pop_transforms(zio);	/* note: may set zio->io_error */
 
-	vdev_stat_update(zio, psize);
+	vdev_stat_update(zio, zio->io_size);
+
+	/*
	 * If this I/O is attached to a particular vdev and is slow,
	 * exceeding 30 seconds to complete, post an error describing
	 * the I/O delay. We ignore these errors if the device is
	 * currently unavailable.
	 */
+	if (zio->io_delay >= zio_delay_max) {
+		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
+			zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
+			    zio->io_vd, zio, 0, 0);
+	}
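The new block above posts an FM_EREPORT_ZFS_DELAY event when a zio's service time reaches the zio_delay_max threshold (milliseconds), skipping vdevs that are already unavailable. Below is a stand-alone sketch of that threshold check; struct io, struct vdev, and post_delay_event() are hypothetical stand-ins for zio_t, vdev_t, and zfs_ereport_post().

#include <stdio.h>

static int zio_delay_max = 30000;	/* ms; mirrors the 30s default above */

struct vdev { int dead; };
struct io { unsigned long delay_ms; struct vdev *vd; };

/* Stand-in for zfs_ereport_post(FM_EREPORT_ZFS_DELAY, ...). */
static void
post_delay_event(struct io *io)
{
	printf("FM_EREPORT_ZFS_DELAY: io delayed %lu ms\n", io->delay_ms);
}

static void
check_delay(struct io *io)
{
	if (io->delay_ms >= (unsigned long)zio_delay_max) {
		/* Ignore delays on devices that are already gone. */
		if (io->vd != NULL && !io->vd->dead)
			post_delay_event(io);
	}
}

int
main(void)
{
	struct vdev vd = { .dead = 0 };
	struct io slow = { .delay_ms = 45000, .vd = &vd };
	struct io fast = { .delay_ms = 12, .vd = &vd };

	check_delay(&slow);	/* posts an event */
	check_delay(&fast);	/* below threshold: silent */
	return (0);
}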
 
 	if (zio->io_error) {
 		/*
@@ -2743,28 +2811,30 @@ zio_done(zio_t *zio)
 		 * at the block level. We ignore these errors if the
 		 * device is currently unavailable.
 		 */
-		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
-			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
+		if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
+		    !vdev_is_dead(zio->io_vd))
+			zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
+			    zio->io_vd, zio, 0, 0);
 
 		if ((zio->io_error == EIO || !(zio->io_flags &
 		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
-		    zio == lio) {
+		    zio == zio->io_logical) {
 			/*
			 * For logical I/O requests, tell the SPA to log the
			 * error and generate a logical data ereport.
			 */
-			spa_log_error(spa, zio);
-			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
-			    0, 0);
+			spa_log_error(zio->io_spa, zio);
+			zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, zio,
+			    0, 0);
 		}
 	}
 
-	if (zio->io_error && zio == lio) {
+	if (zio->io_error && zio == zio->io_logical) {
 		/*
		 * Determine whether zio should be reexecuted. This will
		 * propagate all the way to the root via zio_notify_parent().
		 */
-		ASSERT(vd == NULL && bp != NULL);
+		ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		if (IO_IS_ALLOCATING(zio) &&
@@ -2779,8 +2849,8 @@ zio_done(zio_t *zio)
 		    zio->io_type == ZIO_TYPE_FREE) &&
 		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
 		    zio->io_error == ENXIO &&
-		    spa_load_state(spa) == SPA_LOAD_NONE &&
-		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
+		    spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
+		    spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
@@ -2806,7 +2876,7 @@ zio_done(zio_t *zio)
 	if ((zio->io_error || zio->io_reexecute) &&
 	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
 	    !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
-		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
+		zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
 
 	zio_gang_tree_free(&zio->io_gang_tree);
 
@@ -2871,14 +2941,14 @@ zio_done(zio_t *zio)
 		 * We'd fail again if we reexecuted now, so suspend
		 * until conditions improve (e.g. device comes online).
		 */
-		zio_suspend(spa, zio);
+		zio_suspend(zio->io_spa, zio);
 	} else {
 		/*
		 * Reexecution is potentially a huge amount of work.
		 * Hand it off to the otherwise-unused claim taskq.
		 */
 		(void) taskq_dispatch(
-		    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
+		    zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
 		    (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
 	}
 	return (ZIO_PIPELINE_STOP);
@@ -2958,3 +3028,25 @@ static zio_pipe_stage_t *zio_pipeline[] = {
 	zio_checksum_verify,
 	zio_done
 };
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+/* Fault injection */
+EXPORT_SYMBOL(zio_injection_enabled);
+EXPORT_SYMBOL(zio_inject_fault);
+EXPORT_SYMBOL(zio_inject_list_next);
+EXPORT_SYMBOL(zio_clear_fault);
+EXPORT_SYMBOL(zio_handle_fault_injection);
+EXPORT_SYMBOL(zio_handle_device_injection);
+EXPORT_SYMBOL(zio_handle_label_injection);
+EXPORT_SYMBOL(zio_priority_table);
+EXPORT_SYMBOL(zio_type_name);
+
+module_param(zio_bulk_flags, int, 0644);
+MODULE_PARM_DESC(zio_bulk_flags, "Additional flags to pass to bulk buffers");
+
+module_param(zio_delay_max, int, 0644);
+MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
+
+module_param(zio_requeue_io_start_cut_in_line, int, 0644);
+MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
+#endif
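Usage note: because module_param() registers these tunables with mode 0644, zio_bulk_flags, zio_delay_max, and zio_requeue_io_start_cut_in_line should surface under /sys/module/zfs/parameters/ on a running system, and can likewise be supplied as zfs module options at load time. Per the MODULE_PARM_DESC text above, zio_delay_max is interpreted in milliseconds.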