X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fzio.c;h=e93b7739f28b5f0d19b44f619b3561723b76d96a;hb=c99c90015ece64746e20b74245caca41d1dbefe1;hp=c7eb66103952a39c9579f05e6a546e7a2c0efef9;hpb=da6b4005c92ce02359aabae6549f1031ad1fe41c;p=zfs.git diff --git a/module/zfs/zio.c b/module/zfs/zio.c index c7eb661..e93b773 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. */ #include @@ -61,8 +63,7 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { * ========================================================================== */ char *zio_type_name[ZIO_TYPES] = { - "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", - "zio_ioctl" + "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl" }; /* @@ -72,12 +73,32 @@ char *zio_type_name[ZIO_TYPES] = { */ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; +kmem_cache_t *zio_vdev_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +int zio_bulk_flags = 0; +int zio_delay_max = ZIO_DELAY_MAX; #ifdef _KERNEL extern vmem_t *zio_alloc_arena; #endif +extern int zfs_mg_alloc_failures; + +/* + * The following actions directly effect the spa's sync-to-convergence logic. + * The values below define the sync pass when we start performing the action. + * Care should be taken when changing these values as they directly impact + * spa_sync() performance. Tuning these values may introduce subtle performance + * pathologies and should only be done in the context of performance analysis. + * These tunables will eventually be removed and replaced with #defines once + * enough analysis has been done to determine optimal values. + * + * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that + * regular blocks are not deferred. + */ +int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ +int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ +int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ /* * An allocating zio is one that either currently has the DVA allocate @@ -85,7 +106,7 @@ extern vmem_t *zio_alloc_arena; */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) -boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; +int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG int zio_buf_debug_limit = 16384; @@ -95,6 +116,35 @@ int zio_buf_debug_limit = 0; static inline void __zio_execute(zio_t *zio); +static int +zio_cons(void *arg, void *unused, int kmflag) +{ + zio_t *zio = arg; + + bzero(zio, sizeof (zio_t)); + + mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); + + list_create(&zio->io_parent_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_parent_node)); + list_create(&zio->io_child_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_child_node)); + + return (0); +} + +static void +zio_dest(void *arg, void *unused) +{ + zio_t *zio = arg; + + mutex_destroy(&zio->io_lock); + cv_destroy(&zio->io_cv); + list_destroy(&zio->io_parent_list); + list_destroy(&zio->io_child_list); +} + void zio_init(void) { @@ -104,10 +154,12 @@ zio_init(void) #ifdef _KERNEL data_alloc_arena = zio_alloc_arena; #endif - zio_cache = kmem_cache_create("zio_cache", - sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, + zio_cons, zio_dest, NULL, NULL, NULL, KMC_KMEM); zio_link_cache = kmem_cache_create("zio_link_cache", - sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM); + zio_vdev_cache = kmem_cache_create("zio_vdev_cache", sizeof(vdev_io_t), + PAGESIZE, NULL, NULL, NULL, NULL, NULL, KMC_VMEM); /* * For small buffers, we want a cache for each multiple of @@ -133,15 +185,27 @@ zio_init(void) if (align != 0) { char name[36]; + int flags = zio_bulk_flags; + + /* + * The smallest buffers (512b) are heavily used and + * experience a lot of churn. The slabs allocated + * for them are also relatively small (32K). Thus + * in over to avoid expensive calls to vmalloc() we + * make an exception to the usual slab allocation + * policy and force these buffers to be kmem backed. + */ + if (size == (1 << SPA_MINBLOCKSHIFT)) + flags |= KMC_KMEM; + (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, - size > zio_buf_debug_limit ? KMC_NODEBUG : 0); + align, NULL, NULL, NULL, NULL, NULL, flags); (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, data_alloc_arena, - size > zio_buf_debug_limit ? KMC_NODEBUG : 0); + align, NULL, NULL, NULL, NULL, + data_alloc_arena, flags); } } @@ -155,7 +219,15 @@ zio_init(void) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } + /* + * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs + * to fail 3 times per txg or 8 failures, whichever is greater. + */ + zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); + zio_inject_init(); + + lz4_init(); } void @@ -179,10 +251,13 @@ zio_fini(void) zio_data_buf_cache[c] = NULL; } + kmem_cache_destroy(zio_vdev_cache); kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); + + lz4_fini(); } /* @@ -204,7 +279,7 @@ zio_buf_alloc(size_t size) ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); + return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE | KM_NODEBUG)); } /* @@ -220,7 +295,8 @@ zio_data_buf_alloc(size_t size) ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); + return (kmem_cache_alloc(zio_data_buf_cache[c], + KM_PUSHPAGE | KM_NODEBUG)); } void @@ -244,6 +320,24 @@ zio_data_buf_free(void *buf, size_t size) } /* + * Dedicated I/O buffers to ensure that memory fragmentation never prevents + * or significantly delays the issuing of a zio. These buffers are used + * to aggregate I/O and could be used for raidz stripes. + */ +void * +zio_vdev_alloc(void) +{ + return (kmem_cache_alloc(zio_vdev_cache, KM_PUSHPAGE)); +} + +void +zio_vdev_free(void *buf) +{ + kmem_cache_free(zio_vdev_cache, buf); + +} + +/* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== @@ -252,7 +346,7 @@ static void zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { - zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); + zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_PUSHPAGE); zt->zt_orig_data = zio->io_data; zt->zt_orig_size = zio->io_size; @@ -367,7 +461,7 @@ zio_unique_parent(zio_t *cio) void zio_add_child(zio_t *pio, zio_t *cio) { - zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_PUSHPAGE); int w; /* @@ -488,16 +582,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); - zio = kmem_cache_alloc(zio_cache, KM_SLEEP); - bzero(zio, sizeof (zio_t)); - - mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); - - list_create(&zio->io_parent_list, sizeof (zio_link_t), - offsetof(zio_link_t, zl_parent_node)); - list_create(&zio->io_child_list, sizeof (zio_link_t), - offsetof(zio_link_t, zl_child_node)); + zio = kmem_cache_alloc(zio_cache, KM_PUSHPAGE); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; @@ -509,6 +594,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { + zio->io_logical = NULL; zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; @@ -519,21 +605,54 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; + } else { + zio->io_logical = NULL; + zio->io_bp = NULL; + bzero(&zio->io_bp_copy, sizeof (blkptr_t)); + bzero(&zio->io_bp_orig, sizeof (blkptr_t)); } zio->io_spa = spa; zio->io_txg = txg; + zio->io_ready = NULL; zio->io_done = done; zio->io_private = private; + zio->io_prev_space_delta = 0; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; + zio->io_vsd = NULL; + zio->io_vsd_ops = NULL; zio->io_offset = offset; + zio->io_deadline = 0; + zio->io_timestamp = 0; + zio->io_delta = 0; + zio->io_delay = 0; zio->io_orig_data = zio->io_data = data; zio->io_orig_size = zio->io_size = size; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; + bzero(&zio->io_prop, sizeof (zio_prop_t)); + zio->io_cmd = 0; + zio->io_reexecute = 0; + zio->io_bp_override = NULL; + zio->io_walk_link = NULL; + zio->io_transform_stack = NULL; + zio->io_error = 0; + zio->io_child_count = 0; + zio->io_parent_count = 0; + zio->io_stall = NULL; + zio->io_gang_leader = NULL; + zio->io_gang_tree = NULL; + zio->io_executor = NULL; + zio->io_waiter = NULL; + zio->io_cksum_report = NULL; + zio->io_ena = 0; + bzero(zio->io_child_error, sizeof (int) * ZIO_CHILD_TYPES); + bzero(zio->io_children, + sizeof (uint64_t) * ZIO_CHILD_TYPES * ZIO_WAIT_TYPES); + bzero(&zio->io_bookmark, sizeof (zbookmark_t)); zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); @@ -549,16 +668,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_add_child(pio, zio); } + taskq_init_ent(&zio->io_tqent); + return (zio); } static void zio_destroy(zio_t *zio) { - list_destroy(&zio->io_parent_list); - list_destroy(&zio->io_child_list); - mutex_destroy(&zio->io_lock); - cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } @@ -609,7 +726,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && - zp->zp_type < DMU_OT_NUMTYPES && + DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa) && @@ -670,7 +787,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); - ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); + ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, @@ -893,7 +1010,7 @@ zio_read_bp_init(zio_t *zio) zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } - if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) + if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) @@ -967,7 +1084,7 @@ zio_write_bp_init(zio_t *zio) ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); - if (pass > SYNC_PASS_DONT_COMPRESS) + if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ @@ -996,7 +1113,7 @@ zio_write_bp_init(zio_t *zio) * There should only be a handful of blocks after pass 1 in any case. */ if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && - pass > SYNC_PASS_REWRITE) { + pass >= zfs_sync_pass_rewrite) { enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; ASSERT(psize != 0); zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; @@ -1048,11 +1165,11 @@ zio_free_bp_init(zio_t *zio) */ static void -zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) +zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; - int flags = TQ_NOSLEEP | (cutinline ? TQ_FRONT : 0); + int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and @@ -1069,28 +1186,40 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) t = ZIO_TYPE_NULL; /* - * If this is a high priority I/O, then use the high priority taskq. + * If this is a high priority I/O, then use the high priority taskq if + * available. */ if (zio->io_priority == ZIO_PRIORITY_NOW && - spa->spa_zio_taskq[t][q + 1] != NULL) + spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); - while (taskq_dispatch(spa->spa_zio_taskq[t][q], - (task_func_t *)zio_execute, zio, flags) == 0); /* do nothing */ + /* + * NB: We are assuming that the zio can only be dispatched + * to a single taskq at a time. It would be a grievous error + * to dispatch the zio to another taskq at the same time. + */ + ASSERT(taskq_empty_ent(&zio->io_tqent)); + spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, + flags, &zio->io_tqent); } static boolean_t -zio_taskq_member(zio_t *zio, enum zio_taskq_type q) +zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { kthread_t *executor = zio->io_executor; spa_t *spa = zio->io_spa; zio_type_t t; - for (t = 0; t < ZIO_TYPES; t++) - if (taskq_member(spa->spa_zio_taskq[t][q], executor)) - return (B_TRUE); + for (t = 0; t < ZIO_TYPES; t++) { + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + uint_t i; + for (i = 0; i < tqs->stqs_count; i++) { + if (taskq_member(tqs->stqs_taskq[i], executor)) + return (B_TRUE); + } + } return (B_FALSE); } @@ -1147,6 +1276,8 @@ __zio_execute(zio_t *zio) while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; + dsl_pool_t *dp; + boolean_t cut; int rv; ASSERT(!MUTEX_HELD(&zio->io_lock)); @@ -1159,6 +1290,10 @@ __zio_execute(zio_t *zio) ASSERT(stage <= ZIO_STAGE_DONE); + dp = spa_get_dsl(zio->io_spa); + cut = (stage == ZIO_STAGE_VDEV_IO_START) ? + zio_requeue_io_start_cut_in_line : B_FALSE; + /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, @@ -1170,12 +1305,25 @@ __zio_execute(zio_t *zio) */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { - boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? - zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } +#ifdef _KERNEL + /* + * If we executing in the context of the tx_sync_thread, + * or we are performing pool initialization outside of a + * zio_taskq[ZIO_TASKQ_ISSUE] context. Then issue the zio + * async to minimize stack usage for these deep call paths. + */ + if ((dp && curthread == dp->dp_tx.tx_sync_thread) || + (dp && spa_is_initializing(dp->dp_spa) && + !zio_taskq_member(zio, ZIO_TASKQ_ISSUE))) { + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); + return; + } +#endif + zio->io_stage = stage; rv = zio_pipeline[highbit(stage) - 1](zio); @@ -1206,7 +1354,7 @@ zio_wait(zio_t *zio) mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) - cv_wait(&zio->io_cv, &zio->io_lock); + cv_wait_io(&zio->io_cv, &zio->io_lock); mutex_exit(&zio->io_lock); error = zio->io_error; @@ -1500,7 +1648,7 @@ zio_gang_node_alloc(zio_gang_node_t **gnpp) ASSERT(*gnpp == NULL); - gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); + gn = kmem_zalloc(sizeof (*gn), KM_PUSHPAGE); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; @@ -1751,6 +1899,11 @@ zio_write_gang_block(zio_t *pio) */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + /* + * We didn't allocate this bp, so make sure it doesn't get unmarked. + */ + pio->io_flags &= ~ZIO_FLAG_FASTWRITE; + zio_nowait(zio); return (ZIO_PIPELINE_CONTINUE); @@ -2120,8 +2273,11 @@ zio_ddt_free(zio_t *zio) ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); - ddp = ddt_phys_select(dde, bp); - ddt_phys_decref(ddp); + if (dde) { + ddp = ddt_phys_select(dde, bp); + if (ddp) + ddt_phys_decref(ddp); + } ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); @@ -2139,6 +2295,7 @@ zio_dva_allocate(zio_t *zio) metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = zio->io_bp; int error; + int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); @@ -2146,15 +2303,27 @@ zio_dva_allocate(zio_t *zio) } ASSERT(BP_IS_HOLE(bp)); - ASSERT3U(BP_GET_NDVAS(bp), ==, 0); + ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + /* + * The dump device does not support gang blocks so allocation on + * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid + * the "fast" gang feature. + */ + flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; + flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? + METASLAB_GANG_CHILD : 0; + flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0; error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, 0); + zio->io_prop.zp_copies, zio->io_txg, NULL, flags); if (error) { + spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " + "size %llu, error %d", spa_name(spa), zio, zio->io_size, + error); if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) return (zio_write_gang_block(zio)); zio->io_error = error; @@ -2211,20 +2380,29 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int -zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, - uint64_t size, boolean_t use_slog) +zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size, + boolean_t use_slog) { int error = 1; ASSERT(txg > spa_syncing_txg(spa)); - if (use_slog) + /* + * ZIL blocks are always contiguous (i.e. not gang blocks) so we + * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" + * when allocating them. + */ + if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, - new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + new_bp, 1, txg, NULL, + METASLAB_FASTWRITE | METASLAB_GANG_AVOID); + } - if (error) + if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + new_bp, 1, txg, NULL, + METASLAB_FASTWRITE | METASLAB_GANG_AVOID); + } if (error == 0) { BP_SET_LSIZE(new_bp, size); @@ -2751,6 +2929,17 @@ zio_done(zio_t *zio) vdev_stat_update(zio, zio->io_size); + /* + * If this I/O is attached to a particular vdev is slow, exceeding + * 30 seconds to complete, post an error described the I/O delay. + * We ignore these errors if the device is currently unavailable. + */ + if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) { + if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) + zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, + zio->io_vd, zio, 0, 0); + } + if (zio->io_error) { /* * If this I/O is attached to a particular vdev, @@ -2894,9 +3083,11 @@ zio_done(zio_t *zio) * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ - (void) taskq_dispatch( - zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], - (task_func_t *)zio_reexecute, zio, TQ_SLEEP); + ASSERT(taskq_empty_ent(&zio->io_tqent)); + spa_taskq_dispatch_ent(zio->io_spa, + ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, + (task_func_t *)zio_reexecute, zio, 0, + &zio->io_tqent); } return (ZIO_PIPELINE_STOP); } @@ -2916,6 +3107,11 @@ zio_done(zio_t *zio) zfs_ereport_free_checksum(zcr); } + if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && + !BP_IS_HOLE(zio->io_bp)) { + metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); + } + /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as @@ -2975,3 +3171,67 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_checksum_verify, zio_done }; + +/* dnp is the dnode for zb1->zb_object */ +boolean_t +zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, + const zbookmark_t *zb2) +{ + uint64_t zb1nextL0, zb2thisobj; + + ASSERT(zb1->zb_objset == zb2->zb_objset); + ASSERT(zb2->zb_level == 0); + + /* + * A bookmark in the deadlist is considered to be after + * everything else. + */ + if (zb2->zb_object == DMU_DEADLIST_OBJECT) + return (B_TRUE); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) + return (B_FALSE); + + zb1nextL0 = (zb1->zb_blkid + 1) << + ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + + zb2thisobj = zb2->zb_object ? zb2->zb_object : + zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + + if (zb1->zb_object == DMU_META_DNODE_OBJECT) { + uint64_t nextobj = zb1nextL0 * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; + return (nextobj <= zb2thisobj); + } + + if (zb1->zb_object < zb2thisobj) + return (B_TRUE); + if (zb1->zb_object > zb2thisobj) + return (B_FALSE); + if (zb2->zb_object == DMU_META_DNODE_OBJECT) + return (B_FALSE); + return (zb1nextL0 <= zb2->zb_blkid); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +/* Fault injection */ +EXPORT_SYMBOL(zio_injection_enabled); +EXPORT_SYMBOL(zio_inject_fault); +EXPORT_SYMBOL(zio_inject_list_next); +EXPORT_SYMBOL(zio_clear_fault); +EXPORT_SYMBOL(zio_handle_fault_injection); +EXPORT_SYMBOL(zio_handle_device_injection); +EXPORT_SYMBOL(zio_handle_label_injection); +EXPORT_SYMBOL(zio_priority_table); +EXPORT_SYMBOL(zio_type_name); + +module_param(zio_bulk_flags, int, 0644); +MODULE_PARM_DESC(zio_bulk_flags, "Additional flags to pass to bulk buffers"); + +module_param(zio_delay_max, int, 0644); +MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event"); + +module_param(zio_requeue_io_start_cut_in_line, int, 0644); +MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O"); +#endif