X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Farc.c;h=dca17ab61d31220fa2ddde405506f1381cd0fb18;hb=c93504f03a0881992689069a8f78e17933dcd5b3;hp=32d99bf392d51991540f56e58526cc43a7d1a310;hpb=e06be586410cdad14d2dce76af4f2d43eebe7c83;p=zfs.git diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 32d99bf..dca17ab 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ /* @@ -78,9 +80,9 @@ * types of locks: 1) the hash table lock array, and 2) the * arc list locks. * - * Buffers do not have their own mutexs, rather they rely on the - * hash table mutexs for the bulk of their protection (i.e. most - * fields in the arc_buf_hdr_t are protected by these mutexs). + * Buffers do not have their own mutexes, rather they rely on the + * hash table mutexes for the bulk of their protection (i.e. most + * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns @@ -104,6 +106,14 @@ * protected from simultaneous callbacks from arc_buf_evict() * and arc_do_user_evicts(). * + * It as also possible to register a callback which is run when the + * arc_meta_limit is reached and no buffers can be safely evicted. In + * this case the arc user should drop a reference on some arc buffers so + * they can be reclaimed and the arc_meta_limit honored. For example, + * when using the ZPL each dentry holds a references on a znode. These + * dentries must be pruned before the arc buffer holding the znode can + * be safely evicted. + * * Note that the majority of the performance stats are manipulated * with atomic operations. * @@ -120,29 +130,25 @@ #include #include #include -#include #include #include #ifdef _KERNEL #include #include #include -#include +#include #endif #include #include +#include #include static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static uint8_t arc_thread_exit; -extern int zfs_write_limit_shift; -extern uint64_t zfs_write_limit_max; -extern kmutex_t zfs_write_limit_lock; - -#define ARC_REDUCE_DNLC_PERCENT 3 -uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; +/* number of bytes to prune from caches when at arc_meta_limit is reached */ +uint_t arc_meta_prune = 1048576; typedef enum arc_reclaim_strategy { ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ @@ -150,7 +156,10 @@ typedef enum arc_reclaim_strategy { } arc_reclaim_strategy_t; /* number of seconds before growing cache again */ -static int arc_grow_retry = 60; +static int arc_grow_retry = 5; + +/* expiration time for arc_no_grow */ +static clock_t arc_grow_time = 0; /* shift of arc_c for calculating both min and max arc_p */ static int arc_p_min_shift = 4; @@ -180,6 +189,9 @@ unsigned long zfs_arc_meta_limit = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; +int zfs_arc_memory_throttle_disable = 1; +int zfs_disable_dup_eviction = 0; +int zfs_arc_meta_prune = 0; /* * Note that buffers can be in one of 6 states: @@ -263,6 +275,21 @@ typedef struct arc_stats { kstat_named_t arcstat_hdr_size; kstat_named_t arcstat_data_size; kstat_named_t arcstat_other_size; + kstat_named_t arcstat_anon_size; + kstat_named_t arcstat_anon_evict_data; + kstat_named_t arcstat_anon_evict_metadata; + kstat_named_t arcstat_mru_size; + kstat_named_t arcstat_mru_evict_data; + kstat_named_t arcstat_mru_evict_metadata; + kstat_named_t arcstat_mru_ghost_size; + kstat_named_t arcstat_mru_ghost_evict_data; + kstat_named_t arcstat_mru_ghost_evict_metadata; + kstat_named_t arcstat_mfu_size; + kstat_named_t arcstat_mfu_evict_data; + kstat_named_t arcstat_mfu_evict_metadata; + kstat_named_t arcstat_mfu_ghost_size; + kstat_named_t arcstat_mfu_ghost_evict_data; + kstat_named_t arcstat_mfu_ghost_evict_metadata; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_feeds; @@ -282,6 +309,18 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_duplicate_buffers; + kstat_named_t arcstat_duplicate_buffers_size; + kstat_named_t arcstat_duplicate_reads; + kstat_named_t arcstat_memory_direct_count; + kstat_named_t arcstat_memory_indirect_count; + kstat_named_t arcstat_no_grow; + kstat_named_t arcstat_tempreserve; + kstat_named_t arcstat_loaned_bytes; + kstat_named_t arcstat_prune; + kstat_named_t arcstat_meta_used; + kstat_named_t arcstat_meta_limit; + kstat_named_t arcstat_meta_max; } arc_stats_t; static arc_stats_t arc_stats = { @@ -319,6 +358,21 @@ static arc_stats_t arc_stats = { { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "other_size", KSTAT_DATA_UINT64 }, + { "anon_size", KSTAT_DATA_UINT64 }, + { "anon_evict_data", KSTAT_DATA_UINT64 }, + { "anon_evict_metadata", KSTAT_DATA_UINT64 }, + { "mru_size", KSTAT_DATA_UINT64 }, + { "mru_evict_data", KSTAT_DATA_UINT64 }, + { "mru_evict_metadata", KSTAT_DATA_UINT64 }, + { "mru_ghost_size", KSTAT_DATA_UINT64 }, + { "mru_ghost_evict_data", KSTAT_DATA_UINT64 }, + { "mru_ghost_evict_metadata", KSTAT_DATA_UINT64 }, + { "mfu_size", KSTAT_DATA_UINT64 }, + { "mfu_evict_data", KSTAT_DATA_UINT64 }, + { "mfu_evict_metadata", KSTAT_DATA_UINT64 }, + { "mfu_ghost_size", KSTAT_DATA_UINT64 }, + { "mfu_ghost_evict_data", KSTAT_DATA_UINT64 }, + { "mfu_ghost_evict_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, @@ -337,7 +391,19 @@ static arc_stats_t arc_stats = { { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 } + { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "duplicate_buffers", KSTAT_DATA_UINT64 }, + { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, + { "duplicate_reads", KSTAT_DATA_UINT64 }, + { "memory_direct_count", KSTAT_DATA_UINT64 }, + { "memory_indirect_count", KSTAT_DATA_UINT64 }, + { "arc_no_grow", KSTAT_DATA_UINT64 }, + { "arc_tempreserve", KSTAT_DATA_UINT64 }, + { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, + { "arc_prune", KSTAT_DATA_UINT64 }, + { "arc_meta_used", KSTAT_DATA_UINT64 }, + { "arc_meta_limit", KSTAT_DATA_UINT64 }, + { "arc_meta_max", KSTAT_DATA_UINT64 }, }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -399,13 +465,12 @@ static arc_state_t *arc_l2c_only; #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ - -static int arc_no_grow; /* Don't try to grow cache size */ -static uint64_t arc_tempreserve; -static uint64_t arc_loaned_bytes; -static uint64_t arc_meta_used; -static uint64_t arc_meta_limit; -static uint64_t arc_meta_max = 0; +#define arc_no_grow ARCSTAT(arcstat_no_grow) +#define arc_tempreserve ARCSTAT(arcstat_tempreserve) +#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) +#define arc_meta_used ARCSTAT(arcstat_meta_used) +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) +#define arc_meta_max ARCSTAT(arcstat_meta_max) typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; @@ -465,6 +530,8 @@ struct arc_buf_hdr { list_node_t b_l2node; }; +static list_t arc_prune_list; +static kmutex_t arc_prune_mtx; static arc_buf_t *arc_eviction_list; static kmutex_t arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; @@ -566,14 +633,14 @@ uint64_t zfs_crc64_table[256]; /* * L2ARC Performance Tunables */ -uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ -uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ -uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ -uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ -boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ -boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ +unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ +int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +int l2arc_feed_again = B_TRUE; /* turbo warmup */ +int l2arc_norw = B_FALSE; /* no reads during writes */ /* * L2ARC Internals @@ -817,7 +884,6 @@ buf_cons(void *vbuf, void *unused, int kmflag) bzero(buf, sizeof (arc_buf_t)); mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); @@ -847,26 +913,9 @@ buf_dest(void *vbuf, void *unused) arc_buf_t *buf = vbuf; mutex_destroy(&buf->b_evict_lock); - rw_destroy(&buf->b_data_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } -/* - * Reclaim callback -- invoked when memory is low. - */ -/* ARGSUSED */ -static void -hdr_recl(void *unused) -{ - dprintf("hdr_recl called\n"); - /* - * umem calls the reclaim func when we destroy the buf cache, - * which is after we do arc_fini(). - */ - if (!arc_dead) - cv_signal(&arc_reclaim_thr_cv); -} - static void buf_init(void) { @@ -899,7 +948,7 @@ retry: } hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), - 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); + 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); @@ -960,7 +1009,8 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) mutex_exit(&buf->b_hdr->b_freeze_lock); return; } - buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); + buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), + KM_PUSHPAGE); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); mutex_exit(&buf->b_hdr->b_freeze_lock); @@ -1025,7 +1075,7 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) ASSERT(list_link_active(&ab->b_arc_node)); list_remove(list, ab); if (GHOST_STATE(ab->b_state)) { - ASSERT3U(ab->b_datacnt, ==, 0); + ASSERT0(ab->b_datacnt); ASSERT3P(ab->b_buf, ==, NULL); delta = ab->b_size; } @@ -1238,7 +1288,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; - hdr->b_spa = spa_guid(spa); + hdr->b_spa = spa_load_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); @@ -1325,6 +1375,17 @@ arc_buf_clone(arc_buf_t *from) hdr->b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); + + /* + * This buffer already exists in the arc so create a duplicate + * copy for the caller. If the buffer is associated with user data + * then track the size and number of duplicates. These stats will be + * updated as duplicate buffers are created and destroyed. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMP(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); + } hdr->b_datacnt += 1; return (buf); } @@ -1372,7 +1433,7 @@ arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), { if (HDR_L2_WRITING(hdr)) { l2arc_data_free_t *df; - df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE); df->l2df_data = data; df->l2df_size = size; df->l2df_func = free_func; @@ -1423,6 +1484,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); buf->b_data = NULL; + + /* + * If we're destroying a duplicate buffer make sure + * that the appropriate statistics are updated. + */ + if (buf->b_hdr->b_datacnt > 1 && + buf->b_hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); + } ASSERT(buf->b_hdr->b_datacnt > 0); buf->b_hdr->b_datacnt -= 1; } @@ -1474,6 +1545,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) list_remove(l2hdr->b_dev->l2ad_buflist, hdr); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); if (hdr->b_state == arc_l2c_only) l2arc_hdr_stat_remove(); hdr->b_l2hdr = NULL; @@ -1571,7 +1643,7 @@ int arc_buf_remove_ref(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); + kmutex_t *hash_lock = NULL; int no_callback = (buf->b_efunc == NULL); if (hdr->b_state == arc_anon) { @@ -1580,6 +1652,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) return (no_callback); } + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); @@ -1608,6 +1681,48 @@ arc_buf_size(arc_buf_t *buf) } /* + * Called from the DMU to determine if the current buffer should be + * evicted. In order to ensure proper locking, the eviction must be initiated + * from the DMU. Return true if the buffer is associated with user data and + * duplicate buffers still exist. + */ +boolean_t +arc_buf_eviction_needed(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + boolean_t evict_needed = B_FALSE; + + if (zfs_disable_dup_eviction) + return (B_FALSE); + + mutex_enter(&buf->b_evict_lock); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(); let that function + * perform the eviction. + */ + ASSERT(buf->b_data == NULL); + mutex_exit(&buf->b_evict_lock); + return (B_FALSE); + } else if (buf->b_data == NULL) { + /* + * We have already been added to the arc eviction list; + * recommend eviction. + */ + ASSERT3P(hdr, ==, &arc_eviction_hdr); + mutex_exit(&buf->b_evict_lock); + return (B_TRUE); + } + + if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) + evict_needed = B_TRUE; + + mutex_exit(&buf->b_evict_lock); + return (evict_needed); +} + +/* * Evict buffers from list until we've removed the specified number of * bytes. Move the removed buffers to the appropriate evict state. * If the recycle flag is set, then attempt to "recycle" a buffer: @@ -1657,7 +1772,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, hash_lock = HDR_LOCK(ab); have_lock = MUTEX_HELD(hash_lock); if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); + ASSERT0(refcount_count(&ab->b_refcnt)); ASSERT(ab->b_datacnt > 0); while (ab->b_buf) { arc_buf_t *buf = ab->b_buf; @@ -1909,6 +2024,48 @@ arc_adjust(void) } } +/* + * Request that arc user drop references so that N bytes can be released + * from the cache. This provides a mechanism to ensure the arc can honor + * the arc_meta_limit and reclaim buffers which are pinned in the cache + * by higher layers. (i.e. the zpl) + */ +static void +arc_do_user_prune(int64_t adjustment) +{ + arc_prune_func_t *func; + void *private; + arc_prune_t *cp, *np; + + mutex_enter(&arc_prune_mtx); + + cp = list_head(&arc_prune_list); + while (cp != NULL) { + func = cp->p_pfunc; + private = cp->p_private; + np = list_next(&arc_prune_list, cp); + refcount_add(&cp->p_refcnt, func); + mutex_exit(&arc_prune_mtx); + + if (func != NULL) + func(adjustment, private); + + mutex_enter(&arc_prune_mtx); + + /* User removed prune callback concurrently with execution */ + if (refcount_remove(&cp->p_refcnt, func) == 0) { + ASSERT(!list_link_active(&cp->p_node)); + refcount_destroy(&cp->p_refcnt); + kmem_free(cp, sizeof (*cp)); + } + + cp = np; + } + + ARCSTAT_BUMP(arcstat_prune); + mutex_exit(&arc_prune_mtx); +} + static void arc_do_user_evicts(void) { @@ -1933,6 +2090,32 @@ arc_do_user_evicts(void) } /* + * Evict only meta data objects from the cache leaving the data objects. + * This is only used to enforce the tunable arc_meta_limit, if we are + * unable to evict enough buffers notify the user via the prune callback. + */ +void +arc_adjust_meta(int64_t adjustment, boolean_t may_prune) +{ + int64_t delta; + + if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { + delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); + arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); + adjustment -= delta; + } + + if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { + delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment); + arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA); + adjustment -= delta; + } + + if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit)) + arc_do_user_prune(arc_meta_prune); +} + +/* * Flush all *evictable* data from the cache for the given spa. * NOTE: this will not touch "active" (i.e. referenced) data. */ @@ -1942,7 +2125,7 @@ arc_flush(spa_t *spa) uint64_t guid = 0; if (spa) - guid = spa_guid(spa); + guid = spa_load_guid(spa); while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); @@ -1975,16 +2158,13 @@ arc_flush(spa_t *spa) } void -arc_shrink(void) +arc_shrink(uint64_t bytes) { if (arc_c > arc_c_min) { uint64_t to_free; -#ifdef _KERNEL - to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); -#else - to_free = arc_c >> arc_shrink_shift; -#endif + to_free = bytes ? bytes : arc_c >> arc_shrink_shift; + if (arc_c > arc_c_min + to_free) atomic_add_64(&arc_c, -to_free); else @@ -2003,66 +2183,8 @@ arc_shrink(void) arc_adjust(); } -static int -arc_reclaim_needed(void) -{ -#ifdef _KERNEL - uint64_t extra; - - if (needfree) - return (1); - - /* - * take 'desfree' extra pages, so we reclaim sooner, rather than later - */ - extra = desfree; - - /* - * check that we're out of range of the pageout scanner. It starts to - * schedule paging if freemem is less than lotsfree and needfree. - * lotsfree is the high-water mark for pageout, and needfree is the - * number of needed free pages. We add extra pages here to make sure - * the scanner doesn't start up while we're freeing memory. - */ - if (freemem < lotsfree + needfree + extra) - return (1); - - /* - * check to make sure that swapfs has enough space so that anon - * reservations can still succeed. anon_resvmem() checks that the - * availrmem is greater than swapfs_minfree, and the number of reserved - * swap pages. We also add a bit of extra here just to prevent - * circumstances from getting really dire. - */ - if (availrmem < swapfs_minfree + swapfs_reserve + extra) - return (1); - -#if defined(__i386) - /* - * If we're on an i386 platform, it's possible that we'll exhaust the - * kernel heap space before we ever run out of available physical - * memory. Most checks of the size of the heap_area compare against - * tune.t_minarmem, which is the minimum available real memory that we - * can have in the system. However, this is generally fixed at 25 pages - * which is so low that it's useless. In this comparison, we seek to - * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. (Or, in the calculation, if less than 1/4th is - * free) - */ - if (btop(vmem_size(heap_arena, VMEM_FREE)) < - (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) - return (1); -#endif - -#else - if (spa_get_random(100) == 0) - return (1); -#endif - return (0); -} - static void -arc_kmem_reap_now(arc_reclaim_strategy_t strat) +arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) { size_t i; kmem_cache_t *prev_cache = NULL; @@ -2070,28 +2192,12 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; -#ifdef _KERNEL - if (arc_meta_used >= arc_meta_limit) { - /* - * We are exceeding our meta-data cache limit. - * Purge some DNLC entries to release holds on meta-data. - */ - dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); - } -#if defined(__i386) - /* - * Reclaim unused memory from all kmem caches. - */ - kmem_reap(); -#endif -#endif - /* * An aggressive reclamation will shrink the cache size as well as * reap free buffers from the arc kmem caches. */ if (strat == ARC_RECLAIM_AGGR) - arc_shrink(); + arc_shrink(bytes); for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { if (zio_buf_cache[i] != prev_cache) { @@ -2103,22 +2209,32 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) kmem_cache_reap_now(zio_data_buf_cache[i]); } } + kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_cache); } +/* + * Unlike other ZFS implementations this thread is only responsible for + * adapting the target ARC size on Linux. The responsibility for memory + * reclamation has been entirely delegated to the arc_shrinker_func() + * which is registered with the VM. To reflect this change in behavior + * the arc_reclaim thread has been renamed to arc_adapt. + */ static void -arc_reclaim_thread(void) +arc_adapt_thread(void) { - clock_t growtime = 0; - arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; callb_cpr_t cpr; + int64_t prune; CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&arc_reclaim_thr_lock); while (arc_thread_exit == 0) { - if (arc_reclaim_needed()) { +#ifndef _KERNEL + arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; + + if (spa_get_random(100) == 0) { if (arc_no_grow) { if (last_reclaim == ARC_RECLAIM_CONS) { @@ -2133,14 +2249,25 @@ arc_reclaim_thread(void) } /* reset the growth delay for every reclaim */ - growtime = ddi_get_lbolt() + (arc_grow_retry * hz); + arc_grow_time = ddi_get_lbolt()+(arc_grow_retry * hz); - arc_kmem_reap_now(last_reclaim); + arc_kmem_reap_now(last_reclaim, 0); arc_warm = B_TRUE; + } +#endif /* !_KERNEL */ - } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { + /* No recent memory pressure allow the ARC to grow. */ + if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time) arc_no_grow = FALSE; - } + + /* + * Keep meta data usage within limits, arc_shrink() is not + * used to avoid collapsing the arc_c value when only the + * arc_meta_limit is being exceeded. + */ + prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit; + if (prune > 0) + arc_adjust_meta(prune, B_TRUE); arc_adjust(); @@ -2149,7 +2276,7 @@ arc_reclaim_thread(void) /* block until needed, or one second, whichever is shorter */ CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&arc_reclaim_thr_cv, + (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); } @@ -2160,6 +2287,131 @@ arc_reclaim_thread(void) thread_exit(); } +#ifdef _KERNEL +/* + * Determine the amount of memory eligible for eviction contained in the + * ARC. All clean data reported by the ghost lists can always be safely + * evicted. Due to arc_c_min, the same does not hold for all clean data + * contained by the regular mru and mfu lists. + * + * In the case of the regular mru and mfu lists, we need to report as + * much clean data as possible, such that evicting that same reported + * data will not bring arc_size below arc_c_min. Thus, in certain + * circumstances, the total amount of clean data in the mru and mfu + * lists might not actually be evictable. + * + * The following two distinct cases are accounted for: + * + * 1. The sum of the amount of dirty data contained by both the mru and + * mfu lists, plus the ARC's other accounting (e.g. the anon list), + * is greater than or equal to arc_c_min. + * (i.e. amount of dirty data >= arc_c_min) + * + * This is the easy case; all clean data contained by the mru and mfu + * lists is evictable. Evicting all clean data can only drop arc_size + * to the amount of dirty data, which is greater than arc_c_min. + * + * 2. The sum of the amount of dirty data contained by both the mru and + * mfu lists, plus the ARC's other accounting (e.g. the anon list), + * is less than arc_c_min. + * (i.e. arc_c_min > amount of dirty data) + * + * 2.1. arc_size is greater than or equal arc_c_min. + * (i.e. arc_size >= arc_c_min > amount of dirty data) + * + * In this case, not all clean data from the regular mru and mfu + * lists is actually evictable; we must leave enough clean data + * to keep arc_size above arc_c_min. Thus, the maximum amount of + * evictable data from the two lists combined, is exactly the + * difference between arc_size and arc_c_min. + * + * 2.2. arc_size is less than arc_c_min + * (i.e. arc_c_min > arc_size > amount of dirty data) + * + * In this case, none of the data contained in the mru and mfu + * lists is evictable, even if it's clean. Since arc_size is + * already below arc_c_min, evicting any more would only + * increase this negative difference. + */ +static uint64_t +arc_evictable_memory(void) { + uint64_t arc_clean = + arc_mru->arcs_lsize[ARC_BUFC_DATA] + + arc_mru->arcs_lsize[ARC_BUFC_METADATA] + + arc_mfu->arcs_lsize[ARC_BUFC_DATA] + + arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; + uint64_t ghost_clean = + arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] + + arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] + + arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] + + arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]; + uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0); + + if (arc_dirty >= arc_c_min) + return (ghost_clean + arc_clean); + + return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0)); +} + +static int +__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) +{ + uint64_t pages; + + /* The arc is considered warm once reclaim has occurred */ + if (unlikely(arc_warm == B_FALSE)) + arc_warm = B_TRUE; + + /* Return the potential number of reclaimable pages */ + pages = btop(arc_evictable_memory()); + if (sc->nr_to_scan == 0) + return (pages); + + /* Not allowed to perform filesystem reclaim */ + if (!(sc->gfp_mask & __GFP_FS)) + return (-1); + + /* Reclaim in progress */ + if (mutex_tryenter(&arc_reclaim_thr_lock) == 0) + return (-1); + + /* + * Evict the requested number of pages by shrinking arc_c the + * requested amount. If there is nothing left to evict just + * reap whatever we can from the various arc slabs. + */ + if (pages > 0) { + arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan)); + pages = btop(arc_evictable_memory()); + } else { + arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan)); + pages = -1; + } + + /* + * When direct reclaim is observed it usually indicates a rapid + * increase in memory pressure. This occurs because the kswapd + * threads were unable to asynchronously keep enough free memory + * available. In this case set arc_no_grow to briefly pause arc + * growth to avoid compounding the memory pressure. + */ + if (current_is_kswapd()) { + ARCSTAT_BUMP(arcstat_memory_indirect_count); + } else { + arc_no_grow = B_TRUE; + arc_grow_time = ddi_get_lbolt() + (arc_grow_retry * hz); + ARCSTAT_BUMP(arcstat_memory_direct_count); + } + + mutex_exit(&arc_reclaim_thr_lock); + + return (pages); +} +SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func); + +SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS); +#endif /* _KERNEL */ + /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are comming from. This function is only called @@ -2201,11 +2453,6 @@ arc_adapt(int bytes, arc_state_t *state) } ASSERT((int64_t)arc_p >= 0); - if (arc_reclaim_needed()) { - cv_signal(&arc_reclaim_thr_cv); - return; - } - if (arc_no_grow) return; @@ -2238,19 +2485,7 @@ arc_evict_needed(arc_buf_contents_t type) if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) return (1); -#ifdef _KERNEL - /* - * If zio data pages are being allocated out of a separate heap segment, - * then enforce that the size of available vmem for this area remains - * above about 1/32nd free. - */ - if (type == ARC_BUFC_DATA && zio_arena != NULL && - vmem_size(zio_arena, VMEM_FREE) < - (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) - return (1); -#endif - - if (arc_reclaim_needed()) + if (arc_no_grow) return (1); return (arc_size > arc_c); @@ -2324,16 +2559,27 @@ arc_get_data_buf(arc_buf_t *buf) state = (arc_mru->arcs_lsize[type] >= size && mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } + if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); + + /* + * If we are unable to recycle an existing meta buffer + * signal the reclaim thread. It will notify users + * via the prune callback to drop references. The + * prune callback in run in the context of the reclaim + * thread to avoid deadlocking on the hash_lock. + */ + cv_signal(&arc_reclaim_thr_cv); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); ARCSTAT_INCR(arcstat_data_size, size); atomic_add_64(&arc_size, size); } + ARCSTAT_BUMP(arcstat_recycle_miss); } ASSERT(buf->b_data != NULL); @@ -2472,7 +2718,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * This is a prefetch access... * move this block back to the MRU state. */ - ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); + ASSERT0(refcount_count(&buf->b_refcnt)); new_state = arc_mru; } @@ -2554,10 +2800,12 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_acb; ASSERT(callback_list != NULL); if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { - arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? - byteswap_uint64_array : - dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; - func(buf->b_data, hdr->b_size); + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + if (BP_GET_LEVEL(zio->io_bp) > 0) + byteswap_uint64_array(buf->b_data, hdr->b_size); + else + dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size); } arc_cksum_compute(buf, B_FALSE); @@ -2576,8 +2824,10 @@ arc_read_done(zio_t *zio) abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { if (acb->acb_done) { - if (abuf == NULL) + if (abuf == NULL) { + ARCSTAT_BUMP(arcstat_duplicate_reads); abuf = arc_buf_clone(buf); + } acb->acb_buf = abuf; abuf = NULL; } @@ -2641,7 +2891,7 @@ arc_read_done(zio_t *zio) } /* - * "Read" the block block at the specified DVA (in bp) via the + * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was @@ -2657,48 +2907,17 @@ arc_read_done(zio_t *zio) * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. - * - * Normal callers should use arc_read and pass the arc buffer and offset - * for the bp. But if you know you don't need locking, you can use - * arc_read_bp. */ int -arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, - arc_done_func_t *done, void *private, int priority, int zio_flags, - uint32_t *arc_flags, const zbookmark_t *zb) -{ - int err; - - if (pbuf == NULL) { - /* - * XXX This happens from traverse callback funcs, for - * the objset_phys_t block. - */ - return (arc_read_nolock(pio, spa, bp, done, private, priority, - zio_flags, arc_flags, zb)); - } - - ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); - ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); - rw_enter(&pbuf->b_data_lock, RW_READER); - - err = arc_read_nolock(pio, spa, bp, done, private, priority, - zio_flags, arc_flags, zb); - rw_exit(&pbuf->b_data_lock); - - return (err); -} - -int -arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, - arc_done_func_t *done, void *private, int priority, int zio_flags, - uint32_t *arc_flags, const zbookmark_t *zb) +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, + void *private, int priority, int zio_flags, uint32_t *arc_flags, + const zbookmark_t *zb) { arc_buf_hdr_t *hdr; arc_buf_t *buf = NULL; kmutex_t *hash_lock; zio_t *rzio; - uint64_t guid = spa_guid(spa); + uint64_t guid = spa_load_guid(spa); top: hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), @@ -2720,7 +2939,7 @@ top: arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), - KM_SLEEP); + KM_PUSHPAGE); acb->acb_done = done; acb->acb_private = private; if (pio != NULL) @@ -2811,7 +3030,7 @@ top: /* this block is in the ghost cache */ ASSERT(GHOST_STATE(hdr->b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); + ASSERT0(refcount_count(&hdr->b_refcnt)); ASSERT(hdr->b_buf == NULL); /* if this is a prefetch, we don't have a reference */ @@ -2836,7 +3055,7 @@ top: ASSERT(!GHOST_STATE(hdr->b_state)); - acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); + acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE); acb->acb_done = done; acb->acb_private = private; @@ -2885,7 +3104,7 @@ top: ARCSTAT_BUMP(arcstat_l2_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), - KM_SLEEP); + KM_PUSHPAGE); cb->l2rcb_buf = buf; cb->l2rcb_spa = spa; cb->l2rcb_bp = *bp; @@ -2946,6 +3165,37 @@ top: return (0); } +arc_prune_t * +arc_add_prune_callback(arc_prune_func_t *func, void *private) +{ + arc_prune_t *p; + + p = kmem_alloc(sizeof(*p), KM_SLEEP); + p->p_pfunc = func; + p->p_private = private; + list_link_init(&p->p_node); + refcount_create(&p->p_refcnt); + + mutex_enter(&arc_prune_mtx); + refcount_add(&p->p_refcnt, &arc_prune_list); + list_insert_head(&arc_prune_list, p); + mutex_exit(&arc_prune_mtx); + + return (p); +} + +void +arc_remove_prune_callback(arc_prune_t *p) +{ + mutex_enter(&arc_prune_mtx); + list_remove(&arc_prune_list, p); + if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) { + refcount_destroy(&p->p_refcnt); + kmem_free(p, sizeof (*p)); + } + mutex_exit(&arc_prune_mtx); +} + void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) { @@ -2960,6 +3210,34 @@ arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) } /* + * Notify the arc that a block was freed, and thus will never be used again. + */ +void +arc_freed(spa_t *spa, const blkptr_t *bp) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + uint64_t guid = spa_load_guid(spa); + + hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), + &hash_lock); + if (hdr == NULL) + return; + if (HDR_BUF_AVAILABLE(hdr)) { + arc_buf_t *buf = hdr->b_buf; + add_reference(hdr, hash_lock, FTAG); + hdr->b_flags &= ~ARC_BUF_AVAILABLE; + mutex_exit(hash_lock); + + arc_release(buf, FTAG); + (void) arc_buf_remove_ref(buf, FTAG); + } else { + mutex_exit(hash_lock); + } + +} + +/* * This is used by the DMU to let the ARC know that a buffer is * being evicted, so the ARC should clean up. If this arc buf * is not yet in the evicted state, it will be put there. @@ -3116,6 +3394,16 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } + + /* + * We're releasing a duplicate user data buffer, update + * our statistics accordingly. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, + -hdr->b_size); + } hdr->b_datacnt -= 1; arc_cksum_verify(buf); @@ -3156,24 +3444,12 @@ arc_release(arc_buf_t *buf, void *tag) if (l2hdr) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -buf_size); mutex_exit(&l2arc_buflist_mtx); } } -/* - * Release this buffer. If it does not match the provided BP, fill it - * with that block's contents. - */ -/* ARGSUSED */ -int -arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, - zbookmark_t *zb) -{ - arc_release(buf, tag); - return (0); -} - int arc_released(arc_buf_t *buf) { @@ -3325,7 +3601,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(hdr->b_acb == NULL); if (l2arc) hdr->b_flags |= ARC_L2CACHE; - callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); + callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE); callback->awcb_ready = ready; callback->awcb_done = done; callback->awcb_private = private; @@ -3341,50 +3617,23 @@ static int arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) { #ifdef _KERNEL - uint64_t available_memory = ptob(freemem); - static uint64_t page_load = 0; - static uint64_t last_txg = 0; + uint64_t available_memory; -#if defined(__i386) - available_memory = - MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); -#endif - if (available_memory >= zfs_write_limit_max) + if (zfs_arc_memory_throttle_disable) return (0); - if (txg > last_txg) { - last_txg = txg; - page_load = 0; - } - /* - * If we are in pageout, we know that memory is already tight, - * the arc is already going to be evicting, so we just want to - * continue to let page writes occur as quickly as possible. - */ - if (curproc == proc_pageout) { - if (page_load > MAX(ptob(minfree), available_memory) / 4) - return (ERESTART); - /* Note: reserve is inflated, so we deflate */ - page_load += reserve / 8; - return (0); - } else if (page_load > 0 && arc_reclaim_needed()) { - /* memory is low, delay before restarting */ + /* Easily reclaimable memory (free + inactive + arc-evictable) */ + available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory(); + + if (available_memory <= zfs_write_limit_max) { ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); return (EAGAIN); } - page_load = 0; - - if (arc_size > arc_c_min) { - uint64_t evictable_memory = - arc_mru->arcs_lsize[ARC_BUFC_DATA] + - arc_mru->arcs_lsize[ARC_BUFC_METADATA] + - arc_mfu->arcs_lsize[ARC_BUFC_DATA] + - arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; - available_memory += MIN(evictable_memory, arc_size - arc_c_min); - } if (inflight_data > available_memory / 4) { ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + DMU_TX_STAT_BUMP(dmu_tx_memory_inflight); return (ERESTART); } #endif @@ -3415,8 +3664,10 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) #endif if (reserve > arc_c/4 && !arc_no_grow) arc_c = MIN(arc_c_max, reserve * 4); - if (reserve > arc_c) + if (reserve > arc_c) { + DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); return (ENOMEM); + } /* * Don't count loaned bufs as in flight dirty data to prevent long @@ -3449,12 +3700,55 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, reserve>>10, arc_c>>10); + DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (ERESTART); } atomic_add_64(&arc_tempreserve, reserve); return (0); } +static void +arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, + kstat_named_t *evict_data, kstat_named_t *evict_metadata) +{ + size->value.ui64 = state->arcs_size; + evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; + evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; +} + +static int +arc_kstat_update(kstat_t *ksp, int rw) +{ + arc_stats_t *as = ksp->ks_data; + + if (rw == KSTAT_WRITE) { + return (EACCES); + } else { + arc_kstat_update_state(arc_anon, + &as->arcstat_anon_size, + &as->arcstat_anon_evict_data, + &as->arcstat_anon_evict_metadata); + arc_kstat_update_state(arc_mru, + &as->arcstat_mru_size, + &as->arcstat_mru_evict_data, + &as->arcstat_mru_evict_metadata); + arc_kstat_update_state(arc_mru_ghost, + &as->arcstat_mru_ghost_size, + &as->arcstat_mru_ghost_evict_data, + &as->arcstat_mru_ghost_evict_metadata); + arc_kstat_update_state(arc_mfu, + &as->arcstat_mfu_size, + &as->arcstat_mfu_evict_data, + &as->arcstat_mfu_evict_metadata); + arc_kstat_update_state(arc_mfu_ghost, + &as->arcstat_mfu_ghost_size, + &as->arcstat_mfu_ghost_evict_data, + &as->arcstat_mfu_ghost_evict_metadata); + } + + return (0); +} + void arc_init(void) { @@ -3474,16 +3768,18 @@ arc_init(void) * need to limit the cache to 1/8 of VM size. */ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); + /* + * Register a shrinker to support synchronous (direct) memory + * reclaim from the arc. This is done to prevent kswapd from + * swapping out pages when it is preferable to shrink the arc. + */ + spl_register_shrinker(&arc_shrinker); #endif /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ arc_c_min = MAX(arc_c / 4, 64<<20); - /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ - if (arc_c * 8 >= 1<<30) - arc_c_max = (arc_c * 8) - (1<<30); - else - arc_c_max = arc_c_min; - arc_c_max = MAX(arc_c * 6, arc_c_max); + /* set max to 1/2 of all memory */ + arc_c_max = MAX(arc_c * 4, arc_c_max); /* * Allow the tunables to override our calculations if they are @@ -3499,6 +3795,7 @@ arc_init(void) /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; + arc_meta_max = 0; /* Allow the tunable to override if it is reasonable */ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) @@ -3516,6 +3813,9 @@ arc_init(void) if (zfs_arc_p_min_shift > 0) arc_p_min_shift = zfs_arc_p_min_shift; + if (zfs_arc_meta_prune > 0) + arc_meta_prune = zfs_arc_meta_prune; + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -3561,7 +3861,10 @@ arc_init(void) buf_init(); arc_thread_exit = 0; + list_create(&arc_prune_list, sizeof (arc_prune_t), + offsetof(arc_prune_t, p_node)); arc_eviction_list = NULL; + mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); @@ -3570,10 +3873,11 @@ arc_init(void) if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; + arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } - (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, + (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0, TS_RUN, minclsyspri); arc_dead = FALSE; @@ -3589,7 +3893,13 @@ arc_init(void) void arc_fini(void) { + arc_prune_t *p; + mutex_enter(&arc_reclaim_thr_lock); +#ifdef _KERNEL + spl_unregister_shrinker(&arc_shrinker); +#endif /* _KERNEL */ + arc_thread_exit = 1; while (arc_thread_exit != 0) cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); @@ -3604,6 +3914,17 @@ arc_fini(void) arc_ksp = NULL; } + mutex_enter(&arc_prune_mtx); + while ((p = list_head(&arc_prune_list)) != NULL) { + list_remove(&arc_prune_list, p); + refcount_remove(&p->p_refcnt, &arc_prune_list); + refcount_destroy(&p->p_refcnt); + kmem_free(p, sizeof (*p)); + } + mutex_exit(&arc_prune_mtx); + + list_destroy(&arc_prune_list); + mutex_destroy(&arc_prune_mtx); mutex_destroy(&arc_eviction_mtx); mutex_destroy(&arc_reclaim_thr_lock); cv_destroy(&arc_reclaim_thr_cv); @@ -3823,14 +4144,14 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) static void l2arc_hdr_stat_add(void) { - ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); + ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE); ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); } static void l2arc_hdr_stat_remove(void) { - ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); + ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE); ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); } @@ -3974,6 +4295,7 @@ l2arc_write_done(zio_t *zio) abl2 = ab->b_l2hdr; ab->b_l2hdr = NULL; kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } @@ -4220,6 +4542,7 @@ top: abl2 = ab->b_l2hdr; ab->b_l2hdr = NULL; kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } list_remove(buflist, ab); @@ -4256,7 +4579,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) boolean_t have_lock, full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; - uint64_t guid = spa_guid(spa); + uint64_t guid = spa_load_guid(spa); int try; ASSERT(dev->l2ad_vdev != NULL); @@ -4330,8 +4653,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) */ list_insert_head(dev->l2ad_buflist, head); - cb = kmem_alloc( - sizeof (l2arc_write_callback_t), KM_SLEEP); + cb = kmem_alloc(sizeof (l2arc_write_callback_t), + KM_PUSHPAGE); cb->l2wcb_dev = dev; cb->l2wcb_head = head; pio = zio_root(spa, l2arc_write_done, cb, @@ -4341,9 +4664,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) /* * Create and add a new L2ARC header. */ - hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); + hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), + KM_PUSHPAGE); hdrl2->b_dev = dev; hdrl2->b_daddr = dev->l2ad_hand; + arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS); ab->b_flags |= ARC_L2_WRITING; ab->b_l2hdr = hdrl2; @@ -4386,7 +4711,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) mutex_exit(&l2arc_buflist_mtx); if (pio == NULL) { - ASSERT3U(write_sz, ==, 0); + ASSERT0(write_sz); kmem_cache_free(hdr_cache, head); return (0); } @@ -4435,8 +4760,8 @@ l2arc_feed_thread(void) while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - next); + (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv, + &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; @@ -4480,7 +4805,7 @@ l2arc_feed_thread(void) /* * Avoid contributing to memory pressure. */ - if (arc_reclaim_needed()) { + if (arc_no_grow) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; @@ -4685,13 +5010,58 @@ l2arc_stop(void) EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_remove_ref); EXPORT_SYMBOL(arc_getbuf_func); +EXPORT_SYMBOL(arc_add_prune_callback); +EXPORT_SYMBOL(arc_remove_prune_callback); -module_param(zfs_arc_min, ulong, 0644); -MODULE_PARM_DESC(zfs_arc_min, "Minimum arc size"); +module_param(zfs_arc_min, ulong, 0444); +MODULE_PARM_DESC(zfs_arc_min, "Min arc size"); -module_param(zfs_arc_max, ulong, 0644); -MODULE_PARM_DESC(zfs_arc_max, "Maximum arc size"); +module_param(zfs_arc_max, ulong, 0444); +MODULE_PARM_DESC(zfs_arc_max, "Max arc size"); -module_param(zfs_arc_meta_limit, ulong, 0644); +module_param(zfs_arc_meta_limit, ulong, 0444); MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size"); + +module_param(zfs_arc_meta_prune, int, 0444); +MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune"); + +module_param(zfs_arc_grow_retry, int, 0444); +MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); + +module_param(zfs_arc_shrink_shift, int, 0444); +MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)"); + +module_param(zfs_arc_p_min_shift, int, 0444); +MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p"); + +module_param(zfs_disable_dup_eviction, int, 0644); +MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction"); + +module_param(zfs_arc_memory_throttle_disable, int, 0644); +MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle"); + +module_param(l2arc_write_max, ulong, 0444); +MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval"); + +module_param(l2arc_write_boost, ulong, 0444); +MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup"); + +module_param(l2arc_headroom, ulong, 0444); +MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache"); + +module_param(l2arc_feed_secs, ulong, 0444); +MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing"); + +module_param(l2arc_feed_min_ms, ulong, 0444); +MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds"); + +module_param(l2arc_noprefetch, int, 0444); +MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers"); + +module_param(l2arc_feed_again, int, 0444); +MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup"); + +module_param(l2arc_norw, int, 0444); +MODULE_PARM_DESC(l2arc_norw, "No reads during writes"); + #endif