diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index e9db534..10317b6 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -104,6 +104,14 @@
  * protected from simultaneous callbacks from arc_buf_evict()
  * and arc_do_user_evicts().
  *
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted.  In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored.  For example,
+ * when using the ZPL each dentry holds a reference on a znode.  These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  *
@@ -120,17 +128,17 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #ifdef _KERNEL
 #include
 #include
 #include
-#include
+#include
 #endif
 #include
 #include
+#include
 #include
 
 static kmutex_t arc_reclaim_thr_lock;
@@ -141,8 +149,8 @@ extern int zfs_write_limit_shift;
 extern uint64_t zfs_write_limit_max;
 extern kmutex_t zfs_write_limit_lock;
 
-#define ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+/* number of bytes to prune from caches when the arc_meta_limit is reached */
+uint_t arc_meta_prune = 1048576;
 
 typedef enum arc_reclaim_strategy {
         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
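The prune interface described in the comment above is consumed through arc_add_prune_callback() and arc_remove_prune_callback(), which this patch adds further down.  A minimal sketch of a hypothetical consumer follows; the arc_prune_func_t signature is assumed to match the func(adjustment, private) call made by arc_do_user_prune(), and all myfs_* names are illustrative only:

#include <stdint.h>

/* Assumed declarations, mirroring this patch and its arc.h counterpart. */
typedef struct arc_prune arc_prune_t;
typedef void arc_prune_func_t(int64_t bytes_to_free, void *private);
extern arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *private);
extern void arc_remove_prune_callback(arc_prune_t *p);

/* Hypothetical filesystem state. */
typedef struct myfs_info {
        arc_prune_t *mfi_arc_prune;
} myfs_info_t;

/* Run by the ARC reclaim thread when the arc_meta_limit is exceeded. */
static void
myfs_arc_prune(int64_t bytes_to_free, void *private)
{
        myfs_info_t *mfi = private;

        /*
         * Drop roughly bytes_to_free worth of dentry/znode references
         * here so the arc buffers they pin become evictable.
         */
        (void) mfi;
}

static void
myfs_mount(myfs_info_t *mfi)
{
        mfi->mfi_arc_prune = arc_add_prune_callback(myfs_arc_prune, mfi);
}

static void
myfs_unmount(myfs_info_t *mfi)
{
        arc_remove_prune_callback(mfi->mfi_arc_prune);
}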
@@ -174,12 +182,13 @@ static boolean_t arc_warm;
 /*
  * These tunables are for performance analysis.
  */
-uint64_t zfs_arc_max;
-uint64_t zfs_arc_min;
-uint64_t zfs_arc_meta_limit = 0;
+unsigned long zfs_arc_max = 0;
+unsigned long zfs_arc_min = 0;
+unsigned long zfs_arc_meta_limit = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
+int zfs_arc_meta_prune = 0;
 
 /*
  * Note that buffers can be in one of 6 states:
@@ -263,6 +272,21 @@ typedef struct arc_stats {
         kstat_named_t arcstat_hdr_size;
         kstat_named_t arcstat_data_size;
         kstat_named_t arcstat_other_size;
+        kstat_named_t arcstat_anon_size;
+        kstat_named_t arcstat_anon_evict_data;
+        kstat_named_t arcstat_anon_evict_metadata;
+        kstat_named_t arcstat_mru_size;
+        kstat_named_t arcstat_mru_evict_data;
+        kstat_named_t arcstat_mru_evict_metadata;
+        kstat_named_t arcstat_mru_ghost_size;
+        kstat_named_t arcstat_mru_ghost_evict_data;
+        kstat_named_t arcstat_mru_ghost_evict_metadata;
+        kstat_named_t arcstat_mfu_size;
+        kstat_named_t arcstat_mfu_evict_data;
+        kstat_named_t arcstat_mfu_evict_metadata;
+        kstat_named_t arcstat_mfu_ghost_size;
+        kstat_named_t arcstat_mfu_ghost_evict_data;
+        kstat_named_t arcstat_mfu_ghost_evict_metadata;
         kstat_named_t arcstat_l2_hits;
         kstat_named_t arcstat_l2_misses;
         kstat_named_t arcstat_l2_feeds;
@@ -282,6 +306,15 @@ typedef struct arc_stats {
         kstat_named_t arcstat_l2_size;
         kstat_named_t arcstat_l2_hdr_size;
         kstat_named_t arcstat_memory_throttle_count;
+        kstat_named_t arcstat_memory_direct_count;
+        kstat_named_t arcstat_memory_indirect_count;
+        kstat_named_t arcstat_no_grow;
+        kstat_named_t arcstat_tempreserve;
+        kstat_named_t arcstat_loaned_bytes;
+        kstat_named_t arcstat_prune;
+        kstat_named_t arcstat_meta_used;
+        kstat_named_t arcstat_meta_limit;
+        kstat_named_t arcstat_meta_max;
 } arc_stats_t;
 
 static arc_stats_t arc_stats = {
@@ -319,6 +352,21 @@ static arc_stats_t arc_stats = {
         { "hdr_size", KSTAT_DATA_UINT64 },
         { "data_size", KSTAT_DATA_UINT64 },
         { "other_size", KSTAT_DATA_UINT64 },
+        { "anon_size", KSTAT_DATA_UINT64 },
+        { "anon_evict_data", KSTAT_DATA_UINT64 },
+        { "anon_evict_metadata", KSTAT_DATA_UINT64 },
+        { "mru_size", KSTAT_DATA_UINT64 },
+        { "mru_evict_data", KSTAT_DATA_UINT64 },
+        { "mru_evict_metadata", KSTAT_DATA_UINT64 },
+        { "mru_ghost_size", KSTAT_DATA_UINT64 },
+        { "mru_ghost_evict_data", KSTAT_DATA_UINT64 },
+        { "mru_ghost_evict_metadata", KSTAT_DATA_UINT64 },
+        { "mfu_size", KSTAT_DATA_UINT64 },
+        { "mfu_evict_data", KSTAT_DATA_UINT64 },
+        { "mfu_evict_metadata", KSTAT_DATA_UINT64 },
+        { "mfu_ghost_size", KSTAT_DATA_UINT64 },
+        { "mfu_ghost_evict_data", KSTAT_DATA_UINT64 },
+        { "mfu_ghost_evict_metadata", KSTAT_DATA_UINT64 },
         { "l2_hits", KSTAT_DATA_UINT64 },
         { "l2_misses", KSTAT_DATA_UINT64 },
         { "l2_feeds", KSTAT_DATA_UINT64 },
@@ -337,7 +385,16 @@ static arc_stats_t arc_stats = {
         { "l2_io_error", KSTAT_DATA_UINT64 },
         { "l2_size", KSTAT_DATA_UINT64 },
         { "l2_hdr_size", KSTAT_DATA_UINT64 },
-        { "memory_throttle_count", KSTAT_DATA_UINT64 }
+        { "memory_throttle_count", KSTAT_DATA_UINT64 },
+        { "memory_direct_count", KSTAT_DATA_UINT64 },
+        { "memory_indirect_count", KSTAT_DATA_UINT64 },
+        { "arc_no_grow", KSTAT_DATA_UINT64 },
+        { "arc_tempreserve", KSTAT_DATA_UINT64 },
+        { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
+        { "arc_prune", KSTAT_DATA_UINT64 },
+        { "arc_meta_used", KSTAT_DATA_UINT64 },
+        { "arc_meta_limit", KSTAT_DATA_UINT64 },
+        { "arc_meta_max", KSTAT_DATA_UINT64 },
 };
 
 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
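For reference, the entries added above are published through the arcstats kstat, which under the SPL is assumed to surface as /proc/spl/kstat/zfs/arcstats (the ZFS on Linux convention).  A small self-contained userspace sketch that prints just the new meta-data accounting entries:

#include <stdio.h>
#include <string.h>

int
main(void)
{
        FILE *fp = fopen("/proc/spl/kstat/zfs/arcstats", "r");
        char line[256];

        if (fp == NULL) {
                perror("arcstats");
                return (1);
        }
        while (fgets(line, sizeof (line), fp) != NULL) {
                /* "arc_meta_*" and "arc_prune" are the names added above. */
                if (strncmp(line, "arc_meta_", 9) == 0 ||
                    strncmp(line, "arc_prune", 9) == 0)
                        fputs(line, stdout);
        }
        (void) fclose(fp);
        return (0);
}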
@@ -399,13 +456,12 @@ static arc_state_t *arc_l2c_only;
 
 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
-
-static int              arc_no_grow;    /* Don't try to grow cache size */
-static uint64_t         arc_tempreserve;
-static uint64_t         arc_loaned_bytes;
-static uint64_t         arc_meta_used;
-static uint64_t         arc_meta_limit;
-static uint64_t         arc_meta_max = 0;
+#define arc_no_grow     ARCSTAT(arcstat_no_grow)
+#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
+#define arc_loaned_bytes        ARCSTAT(arcstat_loaned_bytes)
+#define arc_meta_used   ARCSTAT(arcstat_meta_used)
+#define arc_meta_limit  ARCSTAT(arcstat_meta_limit)
+#define arc_meta_max    ARCSTAT(arcstat_meta_max)
 
 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
@@ -465,6 +521,8 @@ struct arc_buf_hdr {
         list_node_t b_l2node;
 };
 
+static list_t arc_prune_list;
+static kmutex_t arc_prune_mtx;
 static arc_buf_t *arc_eviction_list;
 static kmutex_t arc_eviction_mtx;
 static arc_buf_hdr_t arc_eviction_hdr;
@@ -523,12 +581,13 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 
 /*
  * Hash table routines
  */
-#define HT_LOCK_PAD     64
+#define HT_LOCK_ALIGN   64
+#define HT_LOCK_PAD     (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
 
 struct ht_lock {
         kmutex_t        ht_lock;
 #ifdef _KERNEL
-        unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+        unsigned char   pad[HT_LOCK_PAD];
 #endif
 };
@@ -565,14 +624,14 @@ uint64_t zfs_crc64_table[256];
 /*
  * L2ARC Performance Tunables
  */
-uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
-uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
-uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
-uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
-uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
-boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
-boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
-boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
+unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;       /* def max write size */
+unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;     /* extra warmup write */
+unsigned long l2arc_headroom = L2ARC_HEADROOM;          /* # of dev writes */
+unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;        /* interval seconds */
+unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;    /* min interval msecs */
+int l2arc_noprefetch = B_TRUE;          /* don't cache prefetch bufs */
+int l2arc_feed_again = B_TRUE;          /* turbo warmup */
+int l2arc_norw = B_TRUE;                /* no reads during writes */
 
 /*
  * L2ARC Internals
@@ -772,8 +831,15 @@ buf_fini(void)
 {
         int i;
 
+#if defined(_KERNEL) && defined(HAVE_SPL)
+        /* Large allocations which do not require contiguous pages
+         * should be using vmem_free() in the linux kernel */
+        vmem_free(buf_hash_table.ht_table,
+            (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#else
         kmem_free(buf_hash_table.ht_table,
             (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#endif
         for (i = 0; i < BUF_LOCKS; i++)
                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
         kmem_cache_destroy(hdr_cache);
@@ -794,6 +860,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag)
         refcount_create(&buf->b_refcnt);
         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+        list_link_init(&buf->b_arc_node);
+        list_link_init(&buf->b_l2node);
         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 
         return (0);
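The HT_LOCK_PAD change above deserves a note: the old fixed pad of 64 - sizeof (kmutex_t) underflows once a (debug) kmutex_t grows past 64 bytes, while P2NPHASE() pads up to the next HT_LOCK_ALIGN boundary whatever the mutex size.  A worked example, assuming the usual sysmacros.h definition of P2NPHASE():

#include <stdio.h>
#include <stddef.h>

#define P2NPHASE(x, align)      (-(x) & ((align) - 1))
#define HT_LOCK_ALIGN           64

int
main(void)
{
        size_t sizes[] = { 24, 64, 72 };        /* stand-in kmutex_t sizes */
        size_t i;

        for (i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++) {
                size_t pad = P2NPHASE(sizes[i], (size_t)HT_LOCK_ALIGN);
                printf("sizeof (kmutex_t) = %zu -> pad = %zu -> %zu bytes\n",
                    sizes[i], pad, sizes[i] + pad);
        }
        return (0);
}

This prints 24 -> 40 -> 64, 64 -> 0 -> 64, and 72 -> 56 -> 128: each ht_lock lands on a 64-byte boundary, and an oversized mutex no longer produces a negative array size.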
@@ -848,7 +916,6 @@ buf_dest(void *vbuf, void *unused)
 static void
 hdr_recl(void *unused)
 {
-        dprintf("hdr_recl called\n");
         /*
          * umem calls the reclaim func when we destroy the buf cache,
          * which is after we do arc_fini().
@@ -873,8 +940,15 @@ buf_init(void)
                 hsize <<= 1;
 retry:
         buf_hash_table.ht_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+        /* Large allocations which do not require contiguous pages
+         * should be using vmem_alloc() in the linux kernel */
+        buf_hash_table.ht_table =
+            vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+#else
         buf_hash_table.ht_table =
             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+#endif
         if (buf_hash_table.ht_table == NULL) {
                 ASSERT(hsize > (1ULL << 8));
                 hsize >>= 1;
@@ -1143,6 +1217,8 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
         switch (type) {
+        default:
+                break;
         case ARC_SPACE_DATA:
                 ARCSTAT_INCR(arcstat_data_size, space);
                 break;
@@ -1167,6 +1243,8 @@ arc_space_return(uint64_t space, arc_space_type_t type)
         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
         switch (type) {
+        default:
+                break;
         case ARC_SPACE_DATA:
                 ARCSTAT_INCR(arcstat_data_size, -space);
                 break;
@@ -1704,7 +1782,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                 mutex_exit(&state->arcs_mtx);
 
         if (bytes_evicted < bytes)
-                dprintf("only evicted %lld bytes from %x",
+                dprintf("only evicted %lld bytes from %x\n",
                     (longlong_t)bytes_evicted, state);
 
         if (skipped)
@@ -1745,13 +1823,14 @@ static void
 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
 {
         arc_buf_hdr_t *ab, *ab_prev;
-        arc_buf_hdr_t marker = { 0 };
+        arc_buf_hdr_t marker;
         list_t *list = &state->arcs_list[ARC_BUFC_DATA];
         kmutex_t *hash_lock;
         uint64_t bytes_deleted = 0;
         uint64_t bufs_skipped = 0;
 
         ASSERT(GHOST_STATE(state));
+        bzero(&marker, sizeof(marker));
 top:
         mutex_enter(&state->arcs_mtx);
         for (ab = list_tail(list); ab; ab = ab_prev) {
@@ -1819,7 +1898,7 @@ top:
         }
 
         if (bytes_deleted < bytes)
-                dprintf("only deleted %lld bytes from %p",
+                dprintf("only deleted %lld bytes from %p\n",
                     (longlong_t)bytes_deleted, state);
 }
@@ -1887,6 +1966,48 @@ arc_adjust(void)
         }
 }
 
+/*
+ * Request that the arc user drop references so that N bytes can be released
+ * from the cache.  This provides a mechanism to ensure the arc can honor
+ * the arc_meta_limit and reclaim buffers which are pinned in the cache
+ * by higher layers (i.e. the zpl).
+ */
+static void
+arc_do_user_prune(int64_t adjustment)
+{
+        arc_prune_func_t *func;
+        void *private;
+        arc_prune_t *cp, *np;
+
+        mutex_enter(&arc_prune_mtx);
+
+        cp = list_head(&arc_prune_list);
+        while (cp != NULL) {
+                func = cp->p_pfunc;
+                private = cp->p_private;
+                np = list_next(&arc_prune_list, cp);
+                refcount_add(&cp->p_refcnt, func);
+                mutex_exit(&arc_prune_mtx);
+
+                if (func != NULL)
+                        func(adjustment, private);
+
+                mutex_enter(&arc_prune_mtx);
+
+                /* User removed prune callback concurrently with execution */
+                if (refcount_remove(&cp->p_refcnt, func) == 0) {
+                        ASSERT(!list_link_active(&cp->p_node));
+                        refcount_destroy(&cp->p_refcnt);
+                        kmem_free(cp, sizeof (*cp));
+                }
+
+                cp = np;
+        }
+
+        ARCSTAT_BUMP(arcstat_prune);
+        mutex_exit(&arc_prune_mtx);
+}
+
 static void
 arc_do_user_evicts(void)
 {
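Note how arc_do_user_prune() above avoids holding arc_prune_mtx across the user callback: it pins the current entry with a reference, drops the mutex for the call, then re-takes it and frees the entry itself if the user unregistered it mid-call.  A stripped-down pthread sketch of that pattern (types simplified; like the original, it assumes entries other than the pinned one are not freed out from under the walk, which holds when registration changes are rare mount/umount events):

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct cb_entry {
        struct cb_entry *ce_next;
        void (*ce_func)(int64_t, void *);
        void *ce_arg;
        int ce_refs;                    /* protected by cb_lock */
} cb_entry_t;

static pthread_mutex_t cb_lock = PTHREAD_MUTEX_INITIALIZER;
static cb_entry_t *cb_list;

static cb_entry_t *
register_cb(void (*func)(int64_t, void *), void *arg)
{
        cb_entry_t *ce = malloc(sizeof (*ce));  /* error handling elided */

        ce->ce_func = func;
        ce->ce_arg = arg;
        ce->ce_refs = 1;                /* the list's reference */
        pthread_mutex_lock(&cb_lock);
        ce->ce_next = cb_list;
        cb_list = ce;
        pthread_mutex_unlock(&cb_lock);
        return (ce);
}

/* Unregister (not shown) unlinks the entry and drops the list reference,
 * freeing it only when no walker still holds a pin. */
static void
run_prune_callbacks(int64_t adjustment)
{
        cb_entry_t *ce, *next;

        pthread_mutex_lock(&cb_lock);
        for (ce = cb_list; ce != NULL; ce = next) {
                next = ce->ce_next;
                ce->ce_refs++;                  /* pin across the unlock */
                pthread_mutex_unlock(&cb_lock);

                ce->ce_func(adjustment, ce->ce_arg);    /* may block */

                pthread_mutex_lock(&cb_lock);
                if (--ce->ce_refs == 0)         /* unregistered meanwhile */
                        free(ce);
        }
        pthread_mutex_unlock(&cb_lock);
}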
@@ -1911,6 +2032,32 @@ arc_do_user_evicts(void)
 }
 
 /*
+ * Evict only meta data objects from the cache, leaving the data objects.
+ * This is only used to enforce the tunable arc_meta_limit; if we are
+ * unable to evict enough buffers, notify the user via the prune callback.
+ */
+void
+arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
+{
+        int64_t delta;
+
+        if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+                delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+                arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
+                adjustment -= delta;
+        }
+
+        if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+                delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+                arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
+                adjustment -= delta;
+        }
+
+        if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
+                arc_do_user_prune(arc_meta_prune);
+}
+
+/*
  * Flush all *evictable* data from the cache for the given spa.
  * NOTE: this will not touch "active" (i.e. referenced) data.
  */
@@ -1984,9 +2131,8 @@
 static int
 arc_reclaim_needed(void)
 {
-        uint64_t extra;
-
 #ifdef _KERNEL
+        uint64_t extra;
 
         if (needfree)
                 return (1);
@@ -2049,22 +2195,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
         extern kmem_cache_t     *zio_buf_cache[];
         extern kmem_cache_t     *zio_data_buf_cache[];
 
-#ifdef _KERNEL
-        if (arc_meta_used >= arc_meta_limit) {
-                /*
-                 * We are exceeding our meta-data cache limit.
-                 * Purge some DNLC entries to release holds on meta-data.
-                 */
-                dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
-        }
-#if defined(__i386)
-        /*
-         * Reclaim unused memory from all kmem caches.
-         */
-        kmem_reap();
-#endif
-#endif
-
         /*
          * An aggressive reclamation will shrink the cache size as well as
          * reap free buffers from the arc kmem caches.
@@ -2082,6 +2212,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
                         kmem_cache_reap_now(zio_data_buf_cache[i]);
                 }
         }
+        kmem_cache_reap_now(buf_cache);
         kmem_cache_reap_now(hdr_cache);
 }
 
@@ -2092,6 +2223,7 @@ arc_reclaim_thread(void)
         clock_t                 growtime = 0;
         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
         callb_cpr_t             cpr;
+        int64_t                 prune;
 
         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
 
@@ -2121,6 +2253,15 @@ arc_reclaim_thread(void)
                         arc_no_grow = FALSE;
                 }
 
+                /*
+                 * Keep meta data usage within limits; arc_shrink() is not
+                 * used here, to avoid collapsing the arc_c value when only
+                 * the arc_meta_limit is being exceeded.
+                 */
+                prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
+                if (prune > 0)
+                        arc_adjust_meta(prune, B_TRUE);
+
                 arc_adjust();
 
                 if (arc_eviction_list != NULL)
@@ -2128,7 +2269,7 @@ arc_reclaim_thread(void)
 
                 /* block until needed, or one second, whichever is shorter */
                 CALLB_CPR_SAFE_BEGIN(&cpr);
-                (void) cv_timedwait(&arc_reclaim_thr_cv,
+                (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
                     &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
         }
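The reclaim thread above only prunes when arc_meta_used exceeds arc_meta_limit, and arc_adjust_meta() works through the lists in MRU-then-MFU order.  A tiny arithmetic sketch of one pass, with made-up sizes:

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b)       ((a) < (b) ? (a) : (b))

int
main(void)
{
        int64_t adjustment = 300;       /* bytes over arc_meta_limit */
        int64_t mru_meta = 200, mfu_meta = 50;  /* evictable metadata */
        int64_t delta;

        delta = MIN(mru_meta, adjustment);      /* MRU evicted first */
        mru_meta -= delta;
        adjustment -= delta;                    /* 100 still needed */

        delta = MIN(mfu_meta, adjustment);      /* then MFU */
        mfu_meta -= delta;
        adjustment -= delta;                    /* 50 still needed */

        if (adjustment > 0)
                printf("still %lld over; prune callbacks asked to drop "
                    "references\n", (long long)adjustment);
        return (0);
}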
@@ -2139,6 +2280,59 @@
         thread_exit();
 }
 
+#ifdef _KERNEL
+/*
+ * Under Linux the arc shrinker may be called for synchronous (direct)
+ * reclaim, or asynchronous (indirect) reclaim.  When called by kswapd
+ * for indirect reclaim we take a conservative approach and just reap
+ * free slabs from the ARC caches.  If this proves to be insufficient,
+ * direct reclaim will be triggered.  In direct reclaim a more aggressive
+ * strategy is used: data is evicted from the ARC and free slabs reaped.
+ */
+static int
+__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
+{
+        arc_reclaim_strategy_t strategy;
+        int arc_reclaim;
+
+        /* Return number of reclaimable pages based on arc_shrink_shift */
+        arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
+            >> arc_shrink_shift, 0);
+        if (sc->nr_to_scan == 0)
+                return (arc_reclaim);
+
+        /* Prevent reclaim below arc_c_min */
+        if (arc_reclaim <= 0)
+                return (-1);
+
+        /* Not allowed to perform filesystem reclaim */
+        if (!(sc->gfp_mask & __GFP_FS))
+                return (-1);
+
+        /* Reclaim in progress */
+        if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
+                return (-1);
+
+        if (current_is_kswapd()) {
+                strategy = ARC_RECLAIM_CONS;
+                ARCSTAT_INCR(arcstat_memory_indirect_count, 1);
+        } else {
+                strategy = ARC_RECLAIM_AGGR;
+                ARCSTAT_INCR(arcstat_memory_direct_count, 1);
+        }
+
+        arc_kmem_reap_now(strategy);
+        arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
+            >> arc_shrink_shift, 0);
+        mutex_exit(&arc_reclaim_thr_lock);
+
+        return (arc_reclaim);
+}
+SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
+
+SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
+#endif /* _KERNEL */
+
 /*
  * Adapt arc info given the number of bytes we are trying to add and
  * the state that we are coming from.  This function is only called
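The value the shrinker reports is the ARC's headroom above arc_c_min, converted to pages and scaled down by arc_shrink_shift.  A worked example, assuming btop() divides by a 4 KiB page size and arc_shrink_shift = 5 (reclaim up to 1/32nd per pass):

#include <stdio.h>
#include <stdint.h>

#define MAX(a, b)       ((a) > (b) ? (a) : (b))

int
main(void)
{
        int64_t arc_size = 6LL << 30;   /* 6 GiB currently cached */
        int64_t arc_c_min = 2LL << 30;  /* 2 GiB floor */
        int arc_shrink_shift = 5;
        int64_t pages;

        /* btop(arc_size - arc_c_min) >> arc_shrink_shift, floored at 0 */
        pages = MAX(((arc_size - arc_c_min) / 4096) >> arc_shrink_shift, 0);

        printf("shrinker reports %lld reclaimable pages (%lld MiB)\n",
            (long long)pages, (long long)(pages * 4096 >> 20));
        return (0);
}

With 4 GiB of headroom this reports 32768 pages (128 MiB), so each shrinker pass nibbles at the cache rather than collapsing it, and a cache already at arc_c_min reports nothing reclaimable.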
@@ -2303,16 +2497,27 @@ arc_get_data_buf(arc_buf_t *buf)
                 state = (arc_mru->arcs_lsize[type] >= size &&
                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
         }
+
         if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
                 if (type == ARC_BUFC_METADATA) {
                         buf->b_data = zio_buf_alloc(size);
                         arc_space_consume(size, ARC_SPACE_DATA);
+
+                        /*
+                         * If we are unable to recycle an existing meta buffer
+                         * signal the reclaim thread.  It will notify users
+                         * via the prune callback to drop references.  The
+                         * prune callback is run in the context of the reclaim
+                         * thread to avoid deadlocking on the hash_lock.
+                         */
+                        cv_signal(&arc_reclaim_thr_cv);
                 } else {
                         ASSERT(type == ARC_BUFC_DATA);
                         buf->b_data = zio_data_buf_alloc(size);
                         ARCSTAT_INCR(arcstat_data_size, size);
                         atomic_add_64(&arc_size, size);
                 }
+                ARCSTAT_BUMP(arcstat_recycle_miss);
         }
 
         ASSERT(buf->b_data != NULL);
@@ -2674,7 +2879,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     uint32_t *arc_flags, const zbookmark_t *zb)
 {
         arc_buf_hdr_t *hdr;
-        arc_buf_t *buf;
+        arc_buf_t *buf = NULL;
         kmutex_t *hash_lock;
         zio_t *rzio;
         uint64_t guid = spa_guid(spa);
@@ -2699,7 +2904,7 @@ top:
                         arc_callback_t  *acb = NULL;
 
                         acb = kmem_zalloc(sizeof (arc_callback_t),
-                            KM_SLEEP);
+                            KM_PUSHPAGE);
                         acb->acb_done = done;
                         acb->acb_private = private;
                         if (pio != NULL)
@@ -2756,7 +2961,7 @@ top:
                 uint64_t size = BP_GET_LSIZE(bp);
                 arc_callback_t  *acb;
                 vdev_t *vd = NULL;
-                uint64_t addr;
+                uint64_t addr = -1;
                 boolean_t devw = B_FALSE;
 
                 if (hdr == NULL) {
@@ -2815,7 +3020,7 @@ top:
 
                 ASSERT(!GHOST_STATE(hdr->b_state));
 
-                acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+                acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE);
                 acb->acb_done = done;
                 acb->acb_private = private;
 
@@ -2864,7 +3069,7 @@ top:
                                 ARCSTAT_BUMP(arcstat_l2_hits);
 
                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
-                                    KM_SLEEP);
+                                    KM_PUSHPAGE);
                                 cb->l2rcb_buf = buf;
                                 cb->l2rcb_spa = spa;
                                 cb->l2rcb_bp = *bp;
@@ -2925,6 +3130,37 @@ top:
         return (0);
 }
 
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
+{
+        arc_prune_t *p;
+
+        p = kmem_alloc(sizeof(*p), KM_SLEEP);
+        p->p_pfunc = func;
+        p->p_private = private;
+        list_link_init(&p->p_node);
+        refcount_create(&p->p_refcnt);
+
+        mutex_enter(&arc_prune_mtx);
+        refcount_add(&p->p_refcnt, &arc_prune_list);
+        list_insert_head(&arc_prune_list, p);
+        mutex_exit(&arc_prune_mtx);
+
+        return (p);
+}
+
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+        mutex_enter(&arc_prune_mtx);
+        list_remove(&arc_prune_list, p);
+        if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
+                refcount_destroy(&p->p_refcnt);
+                kmem_free(p, sizeof (*p));
+        }
+        mutex_exit(&arc_prune_mtx);
+}
+
 void
 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
 {
@@ -3034,7 +3270,7 @@ arc_release(arc_buf_t *buf, void *tag)
         arc_buf_hdr_t *hdr;
         kmutex_t *hash_lock = NULL;
         l2arc_buf_hdr_t *l2hdr;
-        uint64_t buf_size;
+        uint64_t buf_size = 0;
 
         /*
          * It would be nice to assert that if it's DMU metadata (level >
@@ -3349,6 +3585,7 @@ arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
         } else if (page_load > 0 && arc_reclaim_needed()) {
                 /* memory is low, delay before restarting */
                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+                DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
                 return (EAGAIN);
         }
         page_load = 0;
@@ -3364,6 +3601,7 @@ arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
 
         if (inflight_data > available_memory / 4) {
                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+                DMU_TX_STAT_BUMP(dmu_tx_memory_inflight);
                 return (ERESTART);
         }
 #endif
@@ -3394,8 +3632,10 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
 #endif
         if (reserve > arc_c/4 && !arc_no_grow)
                 arc_c = MIN(arc_c_max, reserve * 4);
-        if (reserve > arc_c)
+        if (reserve > arc_c) {
+                DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
                 return (ENOMEM);
+        }
 
         /*
          * Don't count loaned bufs as in flight dirty data to prevent long
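The cv_signal() added in arc_get_data_buf() above hands the pruning work to the reclaim thread rather than doing it inline, since the allocator may hold a hash lock that a prune callback would need.  A generic pthread sketch of that hand-off, with simplified stand-in state:

#include <pthread.h>

static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reclaim_cv = PTHREAD_COND_INITIALIZER;
static int reclaim_needed;

/* Allocation path: cheap nudge, cf. cv_signal(&arc_reclaim_thr_cv). */
void
signal_reclaim(void)
{
        pthread_mutex_lock(&reclaim_lock);
        reclaim_needed = 1;
        pthread_cond_signal(&reclaim_cv);
        pthread_mutex_unlock(&reclaim_lock);
}

/* Reclaim thread: does the blocking work outside the allocator's locks. */
void *
reclaim_thread(void *arg)
{
        (void) arg;
        pthread_mutex_lock(&reclaim_lock);
        for (;;) {
                while (!reclaim_needed)
                        pthread_cond_wait(&reclaim_cv, &reclaim_lock);
                reclaim_needed = 0;
                /* ... evict buffers and invoke prune callbacks here ... */
        }
}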
@@ -3409,7 +3649,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
          * in order to compress/encrypt/etc the data.  We therefore need to
          * make sure that there is sufficient available memory for this.
          */
-        if (error = arc_memory_throttle(reserve, anon_size, txg))
+        if ((error = arc_memory_throttle(reserve, anon_size, txg)))
                 return (error);
 
         /*
@@ -3428,12 +3668,55 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
                     reserve>>10, arc_c>>10);
+                DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
                 return (ERESTART);
         }
         atomic_add_64(&arc_tempreserve, reserve);
         return (0);
 }
 
+static void
+arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
+    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
+{
+        size->value.ui64 = state->arcs_size;
+        evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
+        evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
+}
+
+static int
+arc_kstat_update(kstat_t *ksp, int rw)
+{
+        arc_stats_t *as = ksp->ks_data;
+
+        if (rw == KSTAT_WRITE) {
+                return (EACCES);
+        } else {
+                arc_kstat_update_state(arc_anon,
+                    &as->arcstat_anon_size,
+                    &as->arcstat_anon_evict_data,
+                    &as->arcstat_anon_evict_metadata);
+                arc_kstat_update_state(arc_mru,
+                    &as->arcstat_mru_size,
+                    &as->arcstat_mru_evict_data,
+                    &as->arcstat_mru_evict_metadata);
+                arc_kstat_update_state(arc_mru_ghost,
+                    &as->arcstat_mru_ghost_size,
+                    &as->arcstat_mru_ghost_evict_data,
+                    &as->arcstat_mru_ghost_evict_metadata);
+                arc_kstat_update_state(arc_mfu,
+                    &as->arcstat_mfu_size,
+                    &as->arcstat_mfu_evict_data,
+                    &as->arcstat_mfu_evict_metadata);
+                arc_kstat_update_state(arc_mfu_ghost,
+                    &as->arcstat_mfu_ghost_size,
+                    &as->arcstat_mfu_ghost_evict_data,
+                    &as->arcstat_mfu_ghost_evict_metadata);
+        }
+
+        return (0);
+}
+
 void
 arc_init(void)
 {
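arc_kstat_update() snapshots the per-state sizes only when the kstat is actually read, rather than updating them on every buffer transition.  The helper is simple enough to check in isolation; a stand-alone sketch with stand-in types (the kstat_named_t layout here is a simplification of the real one):

#include <assert.h>
#include <stdint.h>

typedef struct kstat_named {
        union { uint64_t ui64; } value;
} kstat_named_t;

enum { ARC_BUFC_DATA, ARC_BUFC_METADATA };

typedef struct arc_state {
        uint64_t arcs_size;
        uint64_t arcs_lsize[2];
} arc_state_t;

static void
arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
{
        size->value.ui64 = state->arcs_size;
        evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
        evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
}

int
main(void)
{
        arc_state_t mru = { 1000, { 600, 400 } };
        kstat_named_t size, data, meta;

        arc_kstat_update_state(&mru, &size, &data, &meta);
        assert(size.value.ui64 == 1000);
        assert(data.value.ui64 == 600 && meta.value.ui64 == 400);
        return (0);
}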
@@ -3453,16 +3736,22 @@
          * need to limit the cache to 1/8 of VM size.
          */
         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+
+        /*
+         * Register a shrinker to support synchronous (direct) memory
+         * reclaim from the arc.  This is done to prevent kswapd from
+         * swapping out pages when it is preferable to shrink the arc.
+         */
+        spl_register_shrinker(&arc_shrinker);
 #endif
 
         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
         arc_c_min = MAX(arc_c / 4, 64<<20);
-        /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
-        if (arc_c * 8 >= 1<<30)
-                arc_c_max = (arc_c * 8) - (1<<30);
+        /* set max to 1/2 of all memory, or all but 4GB, whichever is more */
+        if (arc_c * 8 >= ((uint64_t)4<<30))
+                arc_c_max = (arc_c * 8) - ((uint64_t)4<<30);
         else
                 arc_c_max = arc_c_min;
-        arc_c_max = MAX(arc_c * 6, arc_c_max);
+        arc_c_max = MAX(arc_c * 4, arc_c_max);
 
         /*
          * Allow the tunables to override our calculations if they are
@@ -3478,6 +3767,7 @@ arc_init(void)
 
         /* limit meta-data to 1/4 of the arc capacity */
         arc_meta_limit = arc_c_max / 4;
+        arc_meta_max = 0;
 
         /* Allow the tunable to override if it is reasonable */
         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
@@ -3495,6 +3785,9 @@ arc_init(void)
         if (zfs_arc_p_min_shift > 0)
                 arc_p_min_shift = zfs_arc_p_min_shift;
 
+        if (zfs_arc_meta_prune > 0)
+                arc_meta_prune = zfs_arc_meta_prune;
+
         /* if kmem_flags are set, lets try to use less memory */
         if (kmem_debugging())
                 arc_c = arc_c / 2;
@@ -3540,7 +3833,10 @@ arc_init(void)
         buf_init();
 
         arc_thread_exit = 0;
+        list_create(&arc_prune_list, sizeof (arc_prune_t),
+            offsetof(arc_prune_t, p_node));
         arc_eviction_list = NULL;
+        mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
 
@@ -3549,6 +3845,7 @@ arc_init(void)
 
         if (arc_ksp != NULL) {
                 arc_ksp->ks_data = &arc_stats;
+                arc_ksp->ks_update = arc_kstat_update;
                 kstat_install(arc_ksp);
         }
 
@@ -3568,7 +3865,13 @@ arc_init(void)
 void
 arc_fini(void)
 {
+        arc_prune_t *p;
+
         mutex_enter(&arc_reclaim_thr_lock);
+#ifdef _KERNEL
+        spl_unregister_shrinker(&arc_shrinker);
+#endif /* _KERNEL */
+
         arc_thread_exit = 1;
         while (arc_thread_exit != 0)
                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
@@ -3583,6 +3886,17 @@ arc_fini(void)
                 arc_ksp = NULL;
         }
 
+        mutex_enter(&arc_prune_mtx);
+        while ((p = list_head(&arc_prune_list)) != NULL) {
+                list_remove(&arc_prune_list, p);
+                refcount_remove(&p->p_refcnt, &arc_prune_list);
+                refcount_destroy(&p->p_refcnt);
+                kmem_free(p, sizeof (*p));
+        }
+        mutex_exit(&arc_prune_mtx);
+
+        list_destroy(&arc_prune_list);
+        mutex_destroy(&arc_prune_mtx);
         mutex_destroy(&arc_eviction_mtx);
         mutex_destroy(&arc_reclaim_thr_lock);
         cv_destroy(&arc_reclaim_thr_cv);
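The revised sizing above is easiest to see with numbers.  A sketch for a hypothetical 16 GiB machine, assuming arc_c starts at 1/8 of physical memory (consistent with the 1/32-of-memory comment on arc_c_min):

#include <stdio.h>
#include <stdint.h>

#define MAX(a, b)       ((a) > (b) ? (a) : (b))

int
main(void)
{
        uint64_t physmem_bytes = 16ULL << 30;
        uint64_t arc_c = physmem_bytes / 8;                /* 2 GiB */
        uint64_t arc_c_min = MAX(arc_c / 4, 64ULL << 20);  /* 512 MiB */
        uint64_t arc_c_max;

        /* max: 1/2 of all memory, or all but 4 GiB, whichever is more */
        if (arc_c * 8 >= (4ULL << 30))
                arc_c_max = (arc_c * 8) - (4ULL << 30);    /* 12 GiB */
        else
                arc_c_max = arc_c_min;
        arc_c_max = MAX(arc_c * 4, arc_c_max);             /* still 12 GiB */

        printf("arc_c_min %llu MiB, arc_c_max %llu MiB, meta_limit %llu MiB\n",
            (unsigned long long)(arc_c_min >> 20),
            (unsigned long long)(arc_c_max >> 20),
            (unsigned long long)((arc_c_max / 4) >> 20));
        return (0);
}

This yields arc_c_min = 512 MiB, arc_c_max = 12288 MiB, and arc_meta_limit = 3072 MiB, noticeably more conservative than the old "3/4 of memory or all but 1 GB" ceiling.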
@@ -3878,7 +4192,7 @@ out:
  * Free buffers that were tagged for destruction.
  */
 static void
-l2arc_do_free_on_write()
+l2arc_do_free_on_write(void)
 {
         list_t *buflist;
         l2arc_data_free_t *df, *df_prev;
@@ -4058,7 +4372,7 @@ l2arc_read_done(zio_t *zio)
 static list_t *
 l2arc_list_locked(int list_num, kmutex_t **lock)
 {
-        list_t *list;
+        list_t *list = NULL;
 
         ASSERT(list_num >= 0 && list_num <= 3);
@@ -4231,7 +4545,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
         list_t *list;
         uint64_t passed_sz, write_sz, buf_sz, headroom;
         void *buf_data;
-        kmutex_t *hash_lock, *list_lock;
+        kmutex_t *hash_lock, *list_lock = NULL;
         boolean_t have_lock, full;
         l2arc_write_callback_t *cb;
         zio_t *pio, *wzio;
@@ -4414,8 +4728,8 @@ l2arc_feed_thread(void)
 
         while (l2arc_thread_exit == 0) {
                 CALLB_CPR_SAFE_BEGIN(&cpr);
-                (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
-                    next);
+                (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
+                    &l2arc_feed_thr_lock, next);
                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
 
                 next = ddi_get_lbolt() + hz;
@@ -4533,6 +4847,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
         adddev->l2ad_evict = adddev->l2ad_start;
         adddev->l2ad_first = B_TRUE;
         adddev->l2ad_writing = B_FALSE;
+        list_link_init(&adddev->l2ad_node);
         ASSERT3U(adddev->l2ad_write, >, 0);
 
         /*
@@ -4658,3 +4973,57 @@ l2arc_stop(void)
                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
         mutex_exit(&l2arc_feed_thr_lock);
 }
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(arc_read);
+EXPORT_SYMBOL(arc_buf_remove_ref);
+EXPORT_SYMBOL(arc_getbuf_func);
+EXPORT_SYMBOL(arc_add_prune_callback);
+EXPORT_SYMBOL(arc_remove_prune_callback);
+
+module_param(zfs_arc_min, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
+
+module_param(zfs_arc_max, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
+
+module_param(zfs_arc_meta_limit, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
+
+module_param(zfs_arc_meta_prune, int, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
+
+module_param(zfs_arc_grow_retry, int, 0444);
+MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
+
+module_param(zfs_arc_shrink_shift, int, 0444);
+MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
+
+module_param(zfs_arc_p_min_shift, int, 0444);
+MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
+
+module_param(l2arc_write_max, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
+
+module_param(l2arc_write_boost, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
+
+module_param(l2arc_headroom, ulong, 0444);
+MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
+
+module_param(l2arc_feed_secs, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
+
+module_param(l2arc_feed_min_ms, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
+
+module_param(l2arc_noprefetch, int, 0444);
+MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
+
+module_param(l2arc_feed_again, int, 0444);
+MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
+
+module_param(l2arc_norw, int, 0444);
+MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
+
+#endif
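Since the module_param() declarations above use mode 0444, each tunable should appear as a read-only file under /sys/module/zfs/parameters/ once the module is loaded (values are set at load time, e.g. via modprobe options).  A quick userspace check of the ARC limits:

#include <stdio.h>
#include <stddef.h>

int
main(void)
{
        const char *params[] = { "zfs_arc_min", "zfs_arc_max",
            "zfs_arc_meta_limit", "zfs_arc_meta_prune" };
        char path[128], buf[64];
        size_t i;

        for (i = 0; i < sizeof (params) / sizeof (params[0]); i++) {
                FILE *fp;

                (void) snprintf(path, sizeof (path),
                    "/sys/module/zfs/parameters/%s", params[i]);
                if ((fp = fopen(path, "r")) == NULL)
                        continue;       /* module not loaded, or older zfs */
                if (fgets(buf, sizeof (buf), fp) != NULL)
                        printf("%s = %s", params[i], buf);
                (void) fclose(fp);
        }
        return (0);
}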