*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
/*
* protected from simultaneous callbacks from arc_buf_evict()
* and arc_do_user_evicts().
*
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted. In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored. For example,
+ * when using the ZPL each dentry holds a reference on a znode. These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
-#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
-#include <sys/dnlc.h>
+#include <sys/zpl.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
+#include <sys/dmu_tx.h>
#include <zfs_fletcher.h>
static kmutex_t arc_reclaim_thr_lock;
extern uint64_t zfs_write_limit_max;
extern kmutex_t zfs_write_limit_lock;
-#define ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+/* number of bytes to prune from caches when the arc_meta_limit is reached */
+uint_t arc_meta_prune = 1048576;
typedef enum arc_reclaim_strategy {
ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
} arc_reclaim_strategy_t;
/* number of seconds before growing cache again */
-static int arc_grow_retry = 60;
+static int arc_grow_retry = 5;
+
+/* expiration time for arc_no_grow */
+static clock_t arc_grow_time = 0;
/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;
/*
* These tunables are for performance analysis.
*/
-uint64_t zfs_arc_max;
-uint64_t zfs_arc_min;
-uint64_t zfs_arc_meta_limit = 0;
+unsigned long zfs_arc_max = 0;
+unsigned long zfs_arc_min = 0;
+unsigned long zfs_arc_meta_limit = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
+int zfs_arc_meta_prune = 0;
/*
* Note that buffers can be in one of 6 states:
kstat_named_t arcstat_hdr_size;
kstat_named_t arcstat_data_size;
kstat_named_t arcstat_other_size;
+ kstat_named_t arcstat_anon_size;
+ kstat_named_t arcstat_anon_evict_data;
+ kstat_named_t arcstat_anon_evict_metadata;
+ kstat_named_t arcstat_mru_size;
+ kstat_named_t arcstat_mru_evict_data;
+ kstat_named_t arcstat_mru_evict_metadata;
+ kstat_named_t arcstat_mru_ghost_size;
+ kstat_named_t arcstat_mru_ghost_evict_data;
+ kstat_named_t arcstat_mru_ghost_evict_metadata;
+ kstat_named_t arcstat_mfu_size;
+ kstat_named_t arcstat_mfu_evict_data;
+ kstat_named_t arcstat_mfu_evict_metadata;
+ kstat_named_t arcstat_mfu_ghost_size;
+ kstat_named_t arcstat_mfu_ghost_evict_data;
+ kstat_named_t arcstat_mfu_ghost_evict_metadata;
kstat_named_t arcstat_l2_hits;
kstat_named_t arcstat_l2_misses;
kstat_named_t arcstat_l2_feeds;
kstat_named_t arcstat_l2_size;
kstat_named_t arcstat_l2_hdr_size;
kstat_named_t arcstat_memory_throttle_count;
+ kstat_named_t arcstat_memory_direct_count;
+ kstat_named_t arcstat_memory_indirect_count;
+ kstat_named_t arcstat_no_grow;
+ kstat_named_t arcstat_tempreserve;
+ kstat_named_t arcstat_loaned_bytes;
+ kstat_named_t arcstat_prune;
+ kstat_named_t arcstat_meta_used;
+ kstat_named_t arcstat_meta_limit;
+ kstat_named_t arcstat_meta_max;
} arc_stats_t;
static arc_stats_t arc_stats = {
{ "hdr_size", KSTAT_DATA_UINT64 },
{ "data_size", KSTAT_DATA_UINT64 },
{ "other_size", KSTAT_DATA_UINT64 },
+ { "anon_size", KSTAT_DATA_UINT64 },
+ { "anon_evict_data", KSTAT_DATA_UINT64 },
+ { "anon_evict_metadata", KSTAT_DATA_UINT64 },
+ { "mru_size", KSTAT_DATA_UINT64 },
+ { "mru_evict_data", KSTAT_DATA_UINT64 },
+ { "mru_evict_metadata", KSTAT_DATA_UINT64 },
+ { "mru_ghost_size", KSTAT_DATA_UINT64 },
+ { "mru_ghost_evict_data", KSTAT_DATA_UINT64 },
+ { "mru_ghost_evict_metadata", KSTAT_DATA_UINT64 },
+ { "mfu_size", KSTAT_DATA_UINT64 },
+ { "mfu_evict_data", KSTAT_DATA_UINT64 },
+ { "mfu_evict_metadata", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_size", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_evict_data", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_evict_metadata", KSTAT_DATA_UINT64 },
{ "l2_hits", KSTAT_DATA_UINT64 },
{ "l2_misses", KSTAT_DATA_UINT64 },
{ "l2_feeds", KSTAT_DATA_UINT64 },
{ "l2_io_error", KSTAT_DATA_UINT64 },
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
- { "memory_throttle_count", KSTAT_DATA_UINT64 }
+ { "memory_throttle_count", KSTAT_DATA_UINT64 },
+ { "memory_direct_count", KSTAT_DATA_UINT64 },
+ { "memory_indirect_count", KSTAT_DATA_UINT64 },
+ { "arc_no_grow", KSTAT_DATA_UINT64 },
+ { "arc_tempreserve", KSTAT_DATA_UINT64 },
+ { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
+ { "arc_prune", KSTAT_DATA_UINT64 },
+ { "arc_meta_used", KSTAT_DATA_UINT64 },
+ { "arc_meta_limit", KSTAT_DATA_UINT64 },
+ { "arc_meta_max", KSTAT_DATA_UINT64 },
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
-
-static int arc_no_grow; /* Don't try to grow cache size */
-static uint64_t arc_tempreserve;
-static uint64_t arc_loaned_bytes;
-static uint64_t arc_meta_used;
-static uint64_t arc_meta_limit;
-static uint64_t arc_meta_max = 0;
+#define arc_no_grow ARCSTAT(arcstat_no_grow)
+#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
+#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
+#define arc_meta_used ARCSTAT(arcstat_meta_used)
+#define arc_meta_limit ARCSTAT(arcstat_meta_limit)
+#define arc_meta_max ARCSTAT(arcstat_meta_max)
typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
list_node_t b_l2node;
};
+static list_t arc_prune_list;
+static kmutex_t arc_prune_mtx;
static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
* Hash table routines
*/
-#define HT_LOCK_PAD 64
+#define HT_LOCK_ALIGN 64
+#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
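/*
 * Note: HT_LOCK_PAD pads each ht_lock out to a multiple of HT_LOCK_ALIGN
 * (64) bytes, so adjacent hash locks fall on separate cache lines and do
 * not false share.
 */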
struct ht_lock {
kmutex_t ht_lock;
#ifdef _KERNEL
- unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+ unsigned char pad[HT_LOCK_PAD];
#endif
};
/*
* L2ARC Performance Tunables
*/
-uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
-uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
-uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
-uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
-uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
-boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
-boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
-boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
+unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
+unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
+unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
+unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
+int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+int l2arc_feed_again = B_TRUE; /* turbo warmup */
+int l2arc_norw = B_TRUE; /* no reads during writes */
/*
* L2ARC Internals
{
int i;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+ /* Large allocations which do not require contiguous pages
+ * should be using vmem_free() in the linux kernel */
+ vmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#else
kmem_free(buf_hash_table.ht_table,
(buf_hash_table.ht_mask + 1) * sizeof (void *));
+#endif
for (i = 0; i < BUF_LOCKS; i++)
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
kmem_cache_destroy(hdr_cache);
refcount_create(&buf->b_refcnt);
cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_link_init(&buf->b_arc_node);
+ list_link_init(&buf->b_l2node);
arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
return (0);
arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}
-/*
- * Reclaim callback -- invoked when memory is low.
- */
-/* ARGSUSED */
-static void
-hdr_recl(void *unused)
-{
- dprintf("hdr_recl called\n");
- /*
- * umem calls the reclaim func when we destroy the buf cache,
- * which is after we do arc_fini().
- */
- if (!arc_dead)
- cv_signal(&arc_reclaim_thr_cv);
-}
-
static void
buf_init(void)
{
hsize <<= 1;
retry:
buf_hash_table.ht_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+ /* Large allocations which do not require contiguous pages
+ * should be using vmem_alloc() in the linux kernel */
+ buf_hash_table.ht_table =
+ vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+#else
buf_hash_table.ht_table =
kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+#endif
if (buf_hash_table.ht_table == NULL) {
ASSERT(hsize > (1ULL << 8));
hsize >>= 1;
}
hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
- 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+ 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
mutex_exit(&buf->b_hdr->b_freeze_lock);
return;
}
- buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+ buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+ KM_PUSHPAGE);
fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
buf->b_hdr->b_freeze_cksum);
mutex_exit(&buf->b_hdr->b_freeze_lock);
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
switch (type) {
+ default:
+ break;
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, space);
break;
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
switch (type) {
+ default:
+ break;
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, -space);
break;
ASSERT(BUF_EMPTY(hdr));
hdr->b_size = size;
hdr->b_type = type;
- hdr->b_spa = spa_guid(spa);
+ hdr->b_spa = spa_load_guid(spa);
hdr->b_state = arc_anon;
hdr->b_arc_access = 0;
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
mutex_exit(&state->arcs_mtx);
if (bytes_evicted < bytes)
- dprintf("only evicted %lld bytes from %x",
+ dprintf("only evicted %lld bytes from %x\n",
(longlong_t)bytes_evicted, state);
if (skipped)
}
if (bytes_deleted < bytes)
- dprintf("only deleted %lld bytes from %p",
+ dprintf("only deleted %lld bytes from %p\n",
(longlong_t)bytes_deleted, state);
}
}
}
+/*
+ * Request that the arc user drop references so that N bytes can be released
+ * from the cache.  This provides a mechanism to ensure the arc can honor
+ * the arc_meta_limit and reclaim buffers which are pinned in the cache
+ * by higher layers (i.e. the zpl).
+ */
+static void
+arc_do_user_prune(int64_t adjustment)
+{
+ arc_prune_func_t *func;
+ void *private;
+ arc_prune_t *cp, *np;
+
+ mutex_enter(&arc_prune_mtx);
+
+ cp = list_head(&arc_prune_list);
+ while (cp != NULL) {
+ func = cp->p_pfunc;
+ private = cp->p_private;
+ np = list_next(&arc_prune_list, cp);
+ refcount_add(&cp->p_refcnt, func);
+ mutex_exit(&arc_prune_mtx);
+
+ if (func != NULL)
+ func(adjustment, private);
+
+ mutex_enter(&arc_prune_mtx);
+
+ /* User removed prune callback concurrently with execution */
+ if (refcount_remove(&cp->p_refcnt, func) == 0) {
+ ASSERT(!list_link_active(&cp->p_node));
+ refcount_destroy(&cp->p_refcnt);
+ kmem_free(cp, sizeof (*cp));
+ }
+
+ cp = np;
+ }
+
+ ARCSTAT_BUMP(arcstat_prune);
+ mutex_exit(&arc_prune_mtx);
+}
+
static void
arc_do_user_evicts(void)
{
}
/*
+ * Evict only meta data objects from the cache, leaving the data objects.
+ * This is only used to enforce the tunable arc_meta_limit; if we are
+ * unable to evict enough buffers we notify the user via the prune callback.
+ */
+void
+arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
+{
+ int64_t delta;
+
+ if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
+ adjustment -= delta;
+ }
+
+ if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
+ adjustment -= delta;
+ }
+
+ if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
+ arc_do_user_prune(arc_meta_prune);
+}
+
+/*
* Flush all *evictable* data from the cache for the given spa.
* NOTE: this will not touch "active" (i.e. referenced) data.
*/
uint64_t guid = 0;
if (spa)
- guid = spa_guid(spa);
+ guid = spa_load_guid(spa);
while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
}
void
-arc_shrink(void)
+arc_shrink(uint64_t bytes)
{
if (arc_c > arc_c_min) {
uint64_t to_free;
-#ifdef _KERNEL
- to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
-#else
- to_free = arc_c >> arc_shrink_shift;
-#endif
+ to_free = bytes ? bytes : arc_c >> arc_shrink_shift;
+
if (arc_c > arc_c_min + to_free)
atomic_add_64(&arc_c, -to_free);
else
arc_adjust();
}
-static int
-arc_reclaim_needed(void)
-{
- uint64_t extra;
-
-#ifdef _KERNEL
-
- if (needfree)
- return (1);
-
- /*
- * take 'desfree' extra pages, so we reclaim sooner, rather than later
- */
- extra = desfree;
-
- /*
- * check that we're out of range of the pageout scanner. It starts to
- * schedule paging if freemem is less than lotsfree and needfree.
- * lotsfree is the high-water mark for pageout, and needfree is the
- * number of needed free pages. We add extra pages here to make sure
- * the scanner doesn't start up while we're freeing memory.
- */
- if (freemem < lotsfree + needfree + extra)
- return (1);
-
- /*
- * check to make sure that swapfs has enough space so that anon
- * reservations can still succeed. anon_resvmem() checks that the
- * availrmem is greater than swapfs_minfree, and the number of reserved
- * swap pages. We also add a bit of extra here just to prevent
- * circumstances from getting really dire.
- */
- if (availrmem < swapfs_minfree + swapfs_reserve + extra)
- return (1);
-
-#if defined(__i386)
- /*
- * If we're on an i386 platform, it's possible that we'll exhaust the
- * kernel heap space before we ever run out of available physical
- * memory. Most checks of the size of the heap_area compare against
- * tune.t_minarmem, which is the minimum available real memory that we
- * can have in the system. However, this is generally fixed at 25 pages
- * which is so low that it's useless. In this comparison, we seek to
- * calculate the total heap-size, and reclaim if more than 3/4ths of the
- * heap is allocated. (Or, in the calculation, if less than 1/4th is
- * free)
- */
- if (btop(vmem_size(heap_arena, VMEM_FREE)) <
- (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
- return (1);
-#endif
-
-#else
- if (spa_get_random(100) == 0)
- return (1);
-#endif
- return (0);
-}
-
static void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
{
size_t i;
kmem_cache_t *prev_cache = NULL;
extern kmem_cache_t *zio_buf_cache[];
extern kmem_cache_t *zio_data_buf_cache[];
-#ifdef _KERNEL
- if (arc_meta_used >= arc_meta_limit) {
- /*
- * We are exceeding our meta-data cache limit.
- * Purge some DNLC entries to release holds on meta-data.
- */
- dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
- }
-#if defined(__i386)
- /*
- * Reclaim unused memory from all kmem caches.
- */
- kmem_reap();
-#endif
-#endif
-
/*
* An aggressive reclamation will shrink the cache size as well as
* reap free buffers from the arc kmem caches.
*/
if (strat == ARC_RECLAIM_AGGR)
- arc_shrink();
+ arc_shrink(bytes);
for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
if (zio_buf_cache[i] != prev_cache) {
kmem_cache_reap_now(zio_data_buf_cache[i]);
}
}
+
kmem_cache_reap_now(buf_cache);
kmem_cache_reap_now(hdr_cache);
}
+/*
+ * Unlike other ZFS implementations, on Linux this thread is only
+ * responsible for adapting the target ARC size.  The responsibility for
+ * memory reclamation has been entirely delegated to the
+ * arc_shrinker_func(), which is registered with the VM.  To reflect this
+ * change in behavior the arc_reclaim thread has been renamed arc_adapt.
+ */
static void
-arc_reclaim_thread(void)
+arc_adapt_thread(void)
{
- clock_t growtime = 0;
- arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
callb_cpr_t cpr;
+ int64_t prune;
CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
mutex_enter(&arc_reclaim_thr_lock);
while (arc_thread_exit == 0) {
- if (arc_reclaim_needed()) {
+#ifndef _KERNEL
+ arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
+
+ if (spa_get_random(100) == 0) {
if (arc_no_grow) {
if (last_reclaim == ARC_RECLAIM_CONS) {
}
/* reset the growth delay for every reclaim */
- growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
+ arc_grow_time = ddi_get_lbolt()+(arc_grow_retry * hz);
- arc_kmem_reap_now(last_reclaim);
+ arc_kmem_reap_now(last_reclaim, 0);
arc_warm = B_TRUE;
+ }
+#endif /* !_KERNEL */
- } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
+ /* No recent memory pressure; allow the ARC to grow. */
+ if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
arc_no_grow = FALSE;
- }
+
+ /*
+ * Keep meta data usage within limits; arc_shrink() is not
+ * used here to avoid collapsing the arc_c value when only
+ * the arc_meta_limit is being exceeded.
+ */
+ prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
+ if (prune > 0)
+ arc_adjust_meta(prune, B_TRUE);
arc_adjust();
/* block until needed, or one second, whichever is shorter */
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait(&arc_reclaim_thr_cv,
+ (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
&arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
}
thread_exit();
}
+#ifdef _KERNEL
+/*
+ * Determine the amount of memory eligible for eviction contained in the
+ * ARC. All clean data reported by the ghost lists can always be safely
+ * evicted. Due to arc_c_min, the same does not hold for all clean data
+ * contained by the regular mru and mfu lists.
+ *
+ * In the case of the regular mru and mfu lists, we need to report as
+ * much clean data as possible, such that evicting that same reported
+ * data will not bring arc_size below arc_c_min. Thus, in certain
+ * circumstances, the total amount of clean data in the mru and mfu
+ * lists might not actually be evictable.
+ *
+ * The following two distinct cases are accounted for:
+ *
+ * 1. The sum of the amount of dirty data contained by both the mru and
+ * mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ * is greater than or equal to arc_c_min.
+ * (i.e. amount of dirty data >= arc_c_min)
+ *
+ * This is the easy case; all clean data contained by the mru and mfu
+ * lists is evictable. Evicting all clean data can only drop arc_size
+ * to the amount of dirty data, which is greater than arc_c_min.
+ *
+ * 2. The sum of the amount of dirty data contained by both the mru and
+ * mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ * is less than arc_c_min.
+ * (i.e. arc_c_min > amount of dirty data)
+ *
+ * 2.1. arc_size is greater than or equal to arc_c_min.
+ * (i.e. arc_size >= arc_c_min > amount of dirty data)
+ *
+ * In this case, not all clean data from the regular mru and mfu
+ * lists is actually evictable; we must leave enough clean data
+ * to keep arc_size above arc_c_min. Thus, the maximum amount of
+ * evictable data from the two lists combined, is exactly the
+ * difference between arc_size and arc_c_min.
+ *
+ * 2.2. arc_size is less than arc_c_min
+ * (i.e. arc_c_min > arc_size > amount of dirty data)
+ *
+ * In this case, none of the data contained in the mru and mfu
+ * lists is evictable, even if it's clean. Since arc_size is
+ * already below arc_c_min, evicting any more would only
+ * increase this negative difference.
+ */
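/*
 * Worked example with illustrative (assumed) numbers: if arc_c_min is
 * 1 GiB, arc_size is 1.2 GiB and 0.5 GiB of that is dirty, case 2.1
 * applies; only arc_size - arc_c_min = 0.2 GiB of the 0.7 GiB of clean
 * mru/mfu data is reported as evictable, plus everything on the ghost
 * lists.
 */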
+static uint64_t
+arc_evictable_memory(void)
+{
+ uint64_t arc_clean =
+ arc_mru->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+ uint64_t ghost_clean =
+ arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
+ arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
+ uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
+
+ if (arc_dirty >= arc_c_min)
+ return (ghost_clean + arc_clean);
+
+ return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
+}
+
+static int
+__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
+{
+ uint64_t pages;
+
+ /* The arc is considered warm once reclaim has occurred */
+ if (unlikely(arc_warm == B_FALSE))
+ arc_warm = B_TRUE;
+
+ /* Return the potential number of reclaimable pages */
+ pages = btop(arc_evictable_memory());
+ if (sc->nr_to_scan == 0)
+ return (pages);
+
+ /* Not allowed to perform filesystem reclaim */
+ if (!(sc->gfp_mask & __GFP_FS))
+ return (-1);
+
+ /* Reclaim in progress */
+ if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
+ return (-1);
+
+ /*
+ * Evict the requested number of pages by reducing arc_c by the
+ * requested amount.  If there is nothing left to evict, just
+ * reap whatever we can from the various arc slabs.
+ */
+ if (pages > 0) {
+ arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
+ pages = btop(arc_evictable_memory());
+ } else {
+ arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
+ pages = -1;
+ }
+
+ /*
+ * When direct reclaim is observed it usually indicates a rapid
+ * increase in memory pressure. This occurs because the kswapd
+ * threads were unable to asynchronously keep enough free memory
+ * available. In this case set arc_no_grow to briefly pause arc
+ * growth to avoid compounding the memory pressure.
+ */
+ if (current_is_kswapd()) {
+ ARCSTAT_BUMP(arcstat_memory_indirect_count);
+ } else {
+ arc_no_grow = B_TRUE;
+ arc_grow_time = ddi_get_lbolt() + (arc_grow_retry * hz);
+ ARCSTAT_BUMP(arcstat_memory_direct_count);
+ }
+
+ mutex_exit(&arc_reclaim_thr_lock);
+
+ return (pages);
+}
+SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
+
+SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
+#endif /* _KERNEL */
+
/*
* Adapt arc info given the number of bytes we are trying to add and
* the state that we are coming from. This function is only called
}
ASSERT((int64_t)arc_p >= 0);
- if (arc_reclaim_needed()) {
- cv_signal(&arc_reclaim_thr_cv);
- return;
- }
-
if (arc_no_grow)
return;
return (1);
#endif
- if (arc_reclaim_needed())
+ if (arc_no_grow)
return (1);
return (arc_size > arc_c);
state = (arc_mru->arcs_lsize[type] >= size &&
mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
}
+
if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
arc_space_consume(size, ARC_SPACE_DATA);
+
+ /*
+ * If we are unable to recycle an existing meta buffer
+ * signal the reclaim thread. It will notify users
+ * via the prune callback to drop references. The
+ * prune callback is run in the context of the reclaim
+ * thread to avoid deadlocking on the hash_lock.
+ */
+ cv_signal(&arc_reclaim_thr_cv);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size);
}
+
ARCSTAT_BUMP(arcstat_recycle_miss);
}
ASSERT(buf->b_data != NULL);
uint32_t *arc_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
- arc_buf_t *buf;
+ arc_buf_t *buf = NULL;
kmutex_t *hash_lock;
zio_t *rzio;
- uint64_t guid = spa_guid(spa);
+ uint64_t guid = spa_load_guid(spa);
top:
hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
arc_callback_t *acb = NULL;
acb = kmem_zalloc(sizeof (arc_callback_t),
- KM_SLEEP);
+ KM_PUSHPAGE);
acb->acb_done = done;
acb->acb_private = private;
if (pio != NULL)
uint64_t size = BP_GET_LSIZE(bp);
arc_callback_t *acb;
vdev_t *vd = NULL;
- uint64_t addr;
+ uint64_t addr = -1;
boolean_t devw = B_FALSE;
if (hdr == NULL) {
ASSERT(!GHOST_STATE(hdr->b_state));
- acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+ acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE);
acb->acb_done = done;
acb->acb_private = private;
ARCSTAT_BUMP(arcstat_l2_hits);
cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
- KM_SLEEP);
+ KM_PUSHPAGE);
cb->l2rcb_buf = buf;
cb->l2rcb_spa = spa;
cb->l2rcb_bp = *bp;
return (0);
}
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
+{
+ arc_prune_t *p;
+
+ p = kmem_alloc(sizeof (*p), KM_SLEEP);
+ p->p_pfunc = func;
+ p->p_private = private;
+ list_link_init(&p->p_node);
+ refcount_create(&p->p_refcnt);
+
+ mutex_enter(&arc_prune_mtx);
+ refcount_add(&p->p_refcnt, &arc_prune_list);
+ list_insert_head(&arc_prune_list, p);
+ mutex_exit(&arc_prune_mtx);
+
+ return (p);
+}
+
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+ mutex_enter(&arc_prune_mtx);
+ list_remove(&arc_prune_list, p);
+ if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
+ refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+ }
+ mutex_exit(&arc_prune_mtx);
+}
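/*
 * Illustrative sketch (editorial, not part of this change): a consumer
 * such as the ZPL might use the prune API above as follows.  The
 * callback name and body are hypothetical; only arc_add_prune_callback()
 * and arc_remove_prune_callback() come from this change.
 *
 *	static void
 *	example_prune(int64_t bytes_to_free, void *private)
 *	{
 *		... drop enough references (e.g. dentries pinning znodes)
 *		... so that ~bytes_to_free bytes of meta data buffers
 *		... become evictable
 *	}
 *
 *	arc_prune_t *ap = arc_add_prune_callback(example_prune, private);
 *	...
 *	arc_remove_prune_callback(ap);
 */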
+
void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
arc_buf_hdr_t *hdr;
kmutex_t *hash_lock = NULL;
l2arc_buf_hdr_t *l2hdr;
- uint64_t buf_size;
+ uint64_t buf_size = 0;
/*
* It would be nice to assert that if it's DMU metadata (level >
arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
{
#ifdef _KERNEL
- uint64_t available_memory = ptob(freemem);
- static uint64_t page_load = 0;
- static uint64_t last_txg = 0;
+ uint64_t available_memory;
+ /* Easily reclaimable memory (free + inactive + arc-evictable) */
+ available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();
#if defined(__i386)
available_memory =
MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
#endif
- if (available_memory >= zfs_write_limit_max)
- return (0);
- if (txg > last_txg) {
- last_txg = txg;
- page_load = 0;
- }
- /*
- * If we are in pageout, we know that memory is already tight,
- * the arc is already going to be evicting, so we just want to
- * continue to let page writes occur as quickly as possible.
- */
- if (curproc == proc_pageout) {
- if (page_load > MAX(ptob(minfree), available_memory) / 4)
- return (ERESTART);
- /* Note: reserve is inflated, so we deflate */
- page_load += reserve / 8;
- return (0);
- } else if (page_load > 0 && arc_reclaim_needed()) {
- /* memory is low, delay before restarting */
+ if (available_memory <= zfs_write_limit_max) {
ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
return (EAGAIN);
}
- page_load = 0;
-
- if (arc_size > arc_c_min) {
- uint64_t evictable_memory =
- arc_mru->arcs_lsize[ARC_BUFC_DATA] +
- arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
- available_memory += MIN(evictable_memory, arc_size - arc_c_min);
- }
if (inflight_data > available_memory / 4) {
ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ DMU_TX_STAT_BUMP(dmu_tx_memory_inflight);
return (ERESTART);
}
#endif
#endif
if (reserve > arc_c/4 && !arc_no_grow)
arc_c = MIN(arc_c_max, reserve * 4);
- if (reserve > arc_c)
+ if (reserve > arc_c) {
+ DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
return (ENOMEM);
+ }
/*
* Don't count loaned bufs as in flight dirty data to prevent long
* in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
- if (error = arc_memory_throttle(reserve, anon_size, txg))
+ if ((error = arc_memory_throttle(reserve, anon_size, txg)))
return (error);
/*
arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
reserve>>10, arc_c>>10);
+ DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
return (ERESTART);
}
atomic_add_64(&arc_tempreserve, reserve);
return (0);
}
+static void
+arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
+ kstat_named_t *evict_data, kstat_named_t *evict_metadata)
+{
+ size->value.ui64 = state->arcs_size;
+ evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
+ evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
+}
+
+static int
+arc_kstat_update(kstat_t *ksp, int rw)
+{
+ arc_stats_t *as = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE) {
+ return (EACCES);
+ } else {
+ arc_kstat_update_state(arc_anon,
+ &as->arcstat_anon_size,
+ &as->arcstat_anon_evict_data,
+ &as->arcstat_anon_evict_metadata);
+ arc_kstat_update_state(arc_mru,
+ &as->arcstat_mru_size,
+ &as->arcstat_mru_evict_data,
+ &as->arcstat_mru_evict_metadata);
+ arc_kstat_update_state(arc_mru_ghost,
+ &as->arcstat_mru_ghost_size,
+ &as->arcstat_mru_ghost_evict_data,
+ &as->arcstat_mru_ghost_evict_metadata);
+ arc_kstat_update_state(arc_mfu,
+ &as->arcstat_mfu_size,
+ &as->arcstat_mfu_evict_data,
+ &as->arcstat_mfu_evict_metadata);
+ arc_kstat_update_state(arc_mfu_ghost,
+ &as->arcstat_mfu_ghost_size,
+ &as->arcstat_mfu_ghost_evict_data,
+ &as->arcstat_mfu_ghost_evict_metadata);
+ }
+
+ return (0);
+}
+
void
arc_init(void)
{
* need to limit the cache to 1/8 of VM size.
*/
arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+ /*
+ * Register a shrinker to support synchronous (direct) memory
+ * reclaim from the arc. This is done to prevent kswapd from
+ * swapping out pages when it is preferable to shrink the arc.
+ */
+ spl_register_shrinker(&arc_shrinker);
#endif
/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
arc_c_min = MAX(arc_c / 4, 64<<20);
- /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
- if (arc_c * 8 >= 1<<30)
- arc_c_max = (arc_c * 8) - (1<<30);
- else
- arc_c_max = arc_c_min;
- arc_c_max = MAX(arc_c * 6, arc_c_max);
+ /* set max to 1/2 of all memory */
+ arc_c_max = MAX(arc_c * 4, arc_c_max);
/*
* Allow the tunables to override our calculations if they are
/* limit meta-data to 1/4 of the arc capacity */
arc_meta_limit = arc_c_max / 4;
+ arc_meta_max = 0;
/* Allow the tunable to override if it is reasonable */
if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
if (zfs_arc_p_min_shift > 0)
arc_p_min_shift = zfs_arc_p_min_shift;
+ if (zfs_arc_meta_prune > 0)
+ arc_meta_prune = zfs_arc_meta_prune;
+
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
buf_init();
arc_thread_exit = 0;
+ list_create(&arc_prune_list, sizeof (arc_prune_t),
+ offsetof(arc_prune_t, p_node));
arc_eviction_list = NULL;
+ mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
if (arc_ksp != NULL) {
arc_ksp->ks_data = &arc_stats;
+ arc_ksp->ks_update = arc_kstat_update;
kstat_install(arc_ksp);
}
- (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+ (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
TS_RUN, minclsyspri);
arc_dead = FALSE;
void
arc_fini(void)
{
+ arc_prune_t *p;
+
mutex_enter(&arc_reclaim_thr_lock);
+#ifdef _KERNEL
+ spl_unregister_shrinker(&arc_shrinker);
+#endif /* _KERNEL */
+
arc_thread_exit = 1;
while (arc_thread_exit != 0)
cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
arc_ksp = NULL;
}
+ mutex_enter(&arc_prune_mtx);
+ while ((p = list_head(&arc_prune_list)) != NULL) {
+ list_remove(&arc_prune_list, p);
+ refcount_remove(&p->p_refcnt, &arc_prune_list);
+ refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+ }
+ mutex_exit(&arc_prune_mtx);
+
+ list_destroy(&arc_prune_list);
+ mutex_destroy(&arc_prune_mtx);
mutex_destroy(&arc_eviction_mtx);
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
static list_t *
l2arc_list_locked(int list_num, kmutex_t **lock)
{
- list_t *list;
+ list_t *list = NULL;
ASSERT(list_num >= 0 && list_num <= 3);
list_t *list;
uint64_t passed_sz, write_sz, buf_sz, headroom;
void *buf_data;
- kmutex_t *hash_lock, *list_lock;
+ kmutex_t *hash_lock, *list_lock = NULL;
boolean_t have_lock, full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
- uint64_t guid = spa_guid(spa);
+ uint64_t guid = spa_load_guid(spa);
int try;
ASSERT(dev->l2ad_vdev != NULL);
*/
list_insert_head(dev->l2ad_buflist, head);
- cb = kmem_alloc(
- sizeof (l2arc_write_callback_t), KM_SLEEP);
+ cb = kmem_alloc(sizeof (l2arc_write_callback_t),
+ KM_PUSHPAGE);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
pio = zio_root(spa, l2arc_write_done, cb,
/*
* Create and add a new L2ARC header.
*/
- hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+ hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
+ KM_PUSHPAGE);
hdrl2->b_dev = dev;
hdrl2->b_daddr = dev->l2ad_hand;
while (l2arc_thread_exit == 0) {
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
- next);
+ (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
+ &l2arc_feed_thr_lock, next);
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
next = ddi_get_lbolt() + hz;
/*
* Avoid contributing to memory pressure.
*/
- if (arc_reclaim_needed()) {
+ if (arc_no_grow) {
ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
spa_config_exit(spa, SCL_L2ARC, dev);
continue;
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
+ list_link_init(&adddev->l2ad_node);
ASSERT3U(adddev->l2ad_write, >, 0);
/*
cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
mutex_exit(&l2arc_feed_thr_lock);
}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(arc_read);
+EXPORT_SYMBOL(arc_buf_remove_ref);
+EXPORT_SYMBOL(arc_getbuf_func);
+EXPORT_SYMBOL(arc_add_prune_callback);
+EXPORT_SYMBOL(arc_remove_prune_callback);
+
+module_param(zfs_arc_min, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
+
+module_param(zfs_arc_max, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
+
+module_param(zfs_arc_meta_limit, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
+
+module_param(zfs_arc_meta_prune, int, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
+
+module_param(zfs_arc_grow_retry, int, 0444);
+MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
+
+module_param(zfs_arc_shrink_shift, int, 0444);
+MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
+
+module_param(zfs_arc_p_min_shift, int, 0444);
+MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
+
+module_param(l2arc_write_max, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
+
+module_param(l2arc_write_boost, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
+
+module_param(l2arc_headroom, ulong, 0444);
+MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
+
+module_param(l2arc_feed_secs, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
+
+module_param(l2arc_feed_min_ms, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
+
+module_param(l2arc_noprefetch, int, 0444);
+MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
+
+module_param(l2arc_feed_again, int, 0444);
+MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
+
+module_param(l2arc_norw, int, 0444);
+MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
+
+#endif