X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fspa.c;h=e022c325864b3d7767cd1de9e5d045536472351d;hb=7ef5e54e2e28884a04dc800657967b891239e933;hp=244f10d47e92c1e41be037f28019c237754e3881;hpb=9ae529ec5dbdc828ff8326beae58062971d74b2e;p=zfs.git diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 244f10d..e022c32 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -78,23 +78,25 @@ #include "zfs_comutil.h" typedef enum zti_modes { - zti_mode_fixed, /* value is # of threads (min 1) */ - zti_mode_online_percent, /* value is % of online CPUs */ - zti_mode_batch, /* cpu-intensive; value is ignored */ - zti_mode_null, /* don't create a taskq */ - zti_nmodes + ZTI_MODE_FIXED, /* value is # of threads (min 1) */ + ZTI_MODE_ONLINE_PERCENT, /* value is % of online CPUs */ + ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ + ZTI_MODE_NULL, /* don't create a taskq */ + ZTI_NMODES } zti_modes_t; -#define ZTI_FIX(n) { zti_mode_fixed, (n) } -#define ZTI_PCT(n) { zti_mode_online_percent, (n) } -#define ZTI_BATCH { zti_mode_batch, 0 } -#define ZTI_NULL { zti_mode_null, 0 } +#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } +#define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } +#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } +#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } -#define ZTI_ONE ZTI_FIX(1) +#define ZTI_N(n) ZTI_P(n, 1) +#define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { - enum zti_modes zti_mode; + zti_modes_t zti_mode; uint_t zti_value; + uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { @@ -102,21 +104,36 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { }; /* - * Define the taskq threads for the following I/O types: - * NULL, READ, WRITE, FREE, CLAIM, and IOCTL + * This table defines the taskq settings for each ZFS I/O type. When + * initializing a pool, we use this table to create an appropriately sized + * taskq. 
Some operations are low volume and therefore have a small, static + * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE + * macros. Other operations process a large amount of data; the ZTI_BATCH + * macro causes us to create a taskq oriented for throughput. Some operations + * are so high frequency and short-lived that the taskq itself can become a + * point of lock contention. The ZTI_P(#, #) macro indicates that we need an + * additional degree of parallelism specified by the number of threads per- + * taskq and the number of taskqs; when dispatching an event in this case, the + * particular taskq is chosen at random. + * + * The different taskq priorities are to handle the different contexts (issue + * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that + * need to be handled with minimum delay. */ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, - { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, - { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(16), ZTI_FIX(5) }, - { ZTI_PCT(100), ZTI_NULL, ZTI_ONE, ZTI_NULL }, - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ + { ZTI_N(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, /* READ */ + { ZTI_BATCH, ZTI_N(5), ZTI_N(16), ZTI_N(5) }, /* WRITE */ + { ZTI_P(4, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ }; static dsl_syncfunc_t spa_sync_version; static dsl_syncfunc_t spa_sync_props; +static dsl_checkfunc_t spa_change_guid_check; +static dsl_syncfunc_t spa_change_guid_sync; static boolean_t spa_has_active_shared_spare(spa_t *spa); static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, @@ -676,6 +693,47 @@ 
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) } } +/*ARGSUSED*/ +static int +spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + vdev_t *rvd = spa->spa_root_vdev; + uint64_t vdev_state; + ASSERTV(uint64_t *newguid = arg2); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_state = rvd->vdev_state; + spa_config_exit(spa, SCL_STATE, FTAG); + + if (vdev_state != VDEV_STATE_HEALTHY) + return (ENXIO); + + ASSERT3U(spa_guid(spa), !=, *newguid); + + return (0); +} + +static void +spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + uint64_t *newguid = arg2; + uint64_t oldguid; + vdev_t *rvd = spa->spa_root_vdev; + + oldguid = spa_guid(spa); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + rvd->vdev_guid = *newguid; + rvd->vdev_guid_sum += (*newguid - oldguid); + vdev_config_dirty(rvd); + spa_config_exit(spa, SCL_STATE, FTAG); + + spa_history_log_internal(LOG_POOL_GUID_CHANGE, spa, tx, + "old=%lld new=%lld", oldguid, *newguid); +} + /* * Change the GUID for the pool. This is done so that we can later * re-import a pool built from a clone of our own vdevs. 
We will modify @@ -688,29 +746,23 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) int spa_change_guid(spa_t *spa) { - uint64_t oldguid, newguid; - uint64_t txg; - - if (!(spa_mode_global & FWRITE)) - return (EROFS); - - txg = spa_vdev_enter(spa); - - if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY) - return (spa_vdev_exit(spa, NULL, txg, ENXIO)); + int error; + uint64_t guid; - oldguid = spa_guid(spa); - newguid = spa_generate_guid(NULL); - ASSERT3U(oldguid, !=, newguid); + mutex_enter(&spa_namespace_lock); + guid = spa_generate_guid(NULL); - spa->spa_root_vdev->vdev_guid = newguid; - spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid); + error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check, + spa_change_guid_sync, spa, &guid, 5); - vdev_config_dirty(spa->spa_root_vdev); + if (error == 0) { + spa_config_sync(spa, B_FALSE, B_TRUE); + spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID); + } - spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID); + mutex_exit(&spa_namespace_lock); - return (spa_vdev_exit(spa, NULL, txg, 0)); + return (error); } /* @@ -757,48 +809,121 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) offsetof(spa_error_entry_t, se_avl)); } -static taskq_t * -spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, - uint_t value) +static void +spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { - uint_t flags = TASKQ_PREPOPULATE; + const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; + enum zti_modes mode = ztip->zti_mode; + uint_t value = ztip->zti_value; + uint_t count = ztip->zti_count; + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + char name[32]; + uint_t i, flags = 0; boolean_t batch = B_FALSE; - switch (mode) { - case zti_mode_null: - return (NULL); /* no taskq needed */ + if (mode == ZTI_MODE_NULL) { + tqs->stqs_count = 0; + tqs->stqs_taskq = NULL; + return; + } - case zti_mode_fixed: - ASSERT3U(value, >=, 1); - value = MAX(value, 1); - break; + 
ASSERT3U(count, >, 0); - case zti_mode_batch: - batch = B_TRUE; - flags |= TASKQ_THREADS_CPU_PCT; - value = zio_taskq_batch_pct; - break; + tqs->stqs_count = count; + tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); - case zti_mode_online_percent: - flags |= TASKQ_THREADS_CPU_PCT; - break; + for (i = 0; i < count; i++) { + taskq_t *tq; - default: - panic("unrecognized mode for %s taskq (%u:%u) in " - "spa_activate()", - name, mode, value); - break; + switch (mode) { + case ZTI_MODE_FIXED: + ASSERT3U(value, >=, 1); + value = MAX(value, 1); + break; + + case ZTI_MODE_BATCH: + batch = B_TRUE; + flags |= TASKQ_THREADS_CPU_PCT; + value = zio_taskq_batch_pct; + break; + + case ZTI_MODE_ONLINE_PERCENT: + flags |= TASKQ_THREADS_CPU_PCT; + break; + + default: + panic("unrecognized mode for %s_%s taskq (%u:%u) in " + "spa_activate()", + zio_type_name[t], zio_taskq_types[q], mode, value); + break; + } + + if (count > 1) { + (void) snprintf(name, sizeof (name), "%s_%s_%u", + zio_type_name[t], zio_taskq_types[q], i); + } else { + (void) snprintf(name, sizeof (name), "%s_%s", + zio_type_name[t], zio_taskq_types[q]); + } + + if (zio_taskq_sysdc && spa->spa_proc != &p0) { + if (batch) + flags |= TASKQ_DC_BATCH; + + tq = taskq_create_sysdc(name, value, 50, INT_MAX, + spa->spa_proc, zio_taskq_basedc, flags); + } else { + tq = taskq_create_proc(name, value, maxclsyspri, 50, + INT_MAX, spa->spa_proc, flags); + } + + tqs->stqs_taskq[i] = tq; } +} - if (zio_taskq_sysdc && spa->spa_proc != &p0) { - if (batch) - flags |= TASKQ_DC_BATCH; +static void +spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) +{ + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + uint_t i; - return (taskq_create_sysdc(name, value, 50, INT_MAX, - spa->spa_proc, zio_taskq_basedc, flags)); + if (tqs->stqs_taskq == NULL) { + ASSERT3U(tqs->stqs_count, ==, 0); + return; } - return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, - spa->spa_proc, flags)); + + for (i = 0; i < 
tqs->stqs_count; i++) { + ASSERT3P(tqs->stqs_taskq[i], !=, NULL); + taskq_destroy(tqs->stqs_taskq[i]); + } + + kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); + tqs->stqs_taskq = NULL; +} + +/* + * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. + * Note that a type may have multiple discrete taskqs to avoid lock contention + * on the taskq itself. In that case we choose which taskq at random by using + * the low bits of gethrtime(). + */ +void +spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) +{ + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + taskq_t *tq; + + ASSERT3P(tqs->stqs_taskq, !=, NULL); + ASSERT3U(tqs->stqs_count, !=, 0); + + if (tqs->stqs_count == 1) { + tq = tqs->stqs_taskq[0]; + } else { + tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; + } + + taskq_dispatch_ent(tq, func, arg, flags, ent); } static void @@ -808,16 +933,7 @@ spa_create_zio_taskqs(spa_t *spa) for (t = 0; t < ZIO_TYPES; t++) { for (q = 0; q < ZIO_TASKQ_TYPES; q++) { - const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; - enum zti_modes mode = ztip->zti_mode; - uint_t value = ztip->zti_value; - char name[32]; - - (void) snprintf(name, sizeof (name), - "%s_%s", zio_type_name[t], zio_taskq_types[q]); - - spa->spa_zio_taskq[t][q] = - spa_taskq_create(spa, name, mode, value); + spa_taskqs_init(spa, t, q); } } } @@ -976,11 +1092,11 @@ spa_deactivate(spa_t *spa) list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_state_dirty_list); + taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); + for (t = 0; t < ZIO_TYPES; t++) { for (q = 0; q < ZIO_TASKQ_TYPES; q++) { - if (spa->spa_zio_taskq[t][q] != NULL) - taskq_destroy(spa->spa_zio_taskq[t][q]); - spa->spa_zio_taskq[t][q] = NULL; + spa_taskqs_fini(spa, t, q); } } @@ -2172,7 +2288,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (spa_version(spa) >= SPA_VERSION_FEATURES) { 
boolean_t missing_feat_read = B_FALSE; - nvlist_t *unsup_feat; + nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj) != 0) { @@ -2189,27 +2305,32 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } - VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == - 0); + enabled_feat = fnvlist_alloc(); + unsup_feat = fnvlist_alloc(); if (!feature_is_supported(spa->spa_meta_objset, spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, - unsup_feat)) + unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { if (!feature_is_supported(spa->spa_meta_objset, spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj, - unsup_feat)) + unsup_feat, enabled_feat)) { missing_feat_write = B_TRUE; + } } + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); + if (!nvlist_empty(unsup_feat)) { - VERIFY(nvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); } - nvlist_free(unsup_feat); + fnvlist_free(enabled_feat); + fnvlist_free(unsup_feat); if (!missing_feat_read) { fnvlist_add_boolean(spa->spa_load_info, @@ -5700,6 +5821,14 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); + /* + * If we're upgrading the spa version then make sure that + * the config object gets updated with the correct version. 
+ */ + if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) + fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa->spa_uberblock.ub_version); + spa_config_exit(spa, SCL_STATE, FTAG); if (spa->spa_config_syncing) @@ -5720,7 +5849,7 @@ spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) */ ASSERT(tx->tx_txg != TXG_INITIAL); - ASSERT(version <= SPA_VERSION); + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); ASSERT(version >= spa_version(spa)); spa->spa_uberblock.ub_version = version; @@ -5967,6 +6096,12 @@ spa_sync(spa_t *spa, uint64_t txg) tx = dmu_tx_create_assigned(dp, txg); + spa->spa_sync_starttime = gethrtime(); + taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); + spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq, + spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + + NSEC_TO_TICK(spa->spa_deadman_synctime)); + /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. @@ -6020,7 +6155,7 @@ spa_sync(spa_t *spa, uint64_t txg) spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); - if (pass <= SYNC_PASS_DEFERRED_FREE) { + if (pass < zfs_sync_pass_deferred_free) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(free_bpl, spa_free_sync_cb, zio, tx); @@ -6083,6 +6218,9 @@ spa_sync(spa_t *spa, uint64_t txg) rvd->vdev_children, txg, B_TRUE); } + if (error == 0) + spa->spa_last_synced_guid = rvd->vdev_guid; + spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) @@ -6092,6 +6230,9 @@ spa_sync(spa_t *spa, uint64_t txg) } dmu_tx_commit(tx); + taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); + spa->spa_deadman_tqid = 0; + /* * Clear the dirty config list. */ @@ -6241,7 +6382,7 @@ spa_upgrade(spa_t *spa, uint64_t version) * future version would result in an unopenable pool, this shouldn't be * possible. 
*/ - ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); + ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); ASSERT(version >= spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version;