*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
/*
* types of locks: 1) the hash table lock array, and 2) the
* arc list locks.
*
- * Buffers do not have their own mutexs, rather they rely on the
- * hash table mutexs for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexs).
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
*
* buf_hash_find() returns the appropriate mutex (held) when it
* locates the requested buffer in the hash table. It returns
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
+#include <sys/dmu_tx.h>
#include <zfs_fletcher.h>
static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
-extern int zfs_write_limit_shift;
-extern uint64_t zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
-
/* number of bytes to prune from caches when at arc_meta_limit is reached */
uint_t arc_meta_prune = 1048576;
} arc_reclaim_strategy_t;
/* number of seconds before growing cache again */
-static int arc_grow_retry = 60;
+static int arc_grow_retry = 5;
+
+/* expiration time for arc_no_grow */
+static clock_t arc_grow_time = 0;
/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
+int zfs_arc_memory_throttle_disable = 1;
+int zfs_disable_dup_eviction = 0;
int zfs_arc_meta_prune = 0;
/*
kstat_named_t arcstat_l2_size;
kstat_named_t arcstat_l2_hdr_size;
kstat_named_t arcstat_memory_throttle_count;
+ kstat_named_t arcstat_duplicate_buffers;
+ kstat_named_t arcstat_duplicate_buffers_size;
+ kstat_named_t arcstat_duplicate_reads;
kstat_named_t arcstat_memory_direct_count;
kstat_named_t arcstat_memory_indirect_count;
kstat_named_t arcstat_no_grow;
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
+ { "duplicate_buffers", KSTAT_DATA_UINT64 },
+ { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
+ { "duplicate_reads", KSTAT_DATA_UINT64 },
{ "memory_direct_count", KSTAT_DATA_UINT64 },
{ "memory_indirect_count", KSTAT_DATA_UINT64 },
{ "arc_no_grow", KSTAT_DATA_UINT64 },
bzero(buf, sizeof (arc_buf_t));
mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
return (0);
arc_buf_t *buf = vbuf;
mutex_destroy(&buf->b_evict_lock);
- rw_destroy(&buf->b_data_lock);
arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}
-/*
- * Reclaim callback -- invoked when memory is low.
- */
-/* ARGSUSED */
-static void
-hdr_recl(void *unused)
-{
- /*
- * umem calls the reclaim func when we destroy the buf cache,
- * which is after we do arc_fini().
- */
- if (!arc_dead)
- cv_signal(&arc_reclaim_thr_cv);
-}
-
static void
buf_init(void)
{
}
hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
- 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+ 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
mutex_exit(&buf->b_hdr->b_freeze_lock);
return;
}
- buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+ buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+ KM_PUSHPAGE);
fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
buf->b_hdr->b_freeze_cksum);
mutex_exit(&buf->b_hdr->b_freeze_lock);
ASSERT(list_link_active(&ab->b_arc_node));
list_remove(list, ab);
if (GHOST_STATE(ab->b_state)) {
- ASSERT3U(ab->b_datacnt, ==, 0);
+ ASSERT0(ab->b_datacnt);
ASSERT3P(ab->b_buf, ==, NULL);
delta = ab->b_size;
}
ASSERT(BUF_EMPTY(hdr));
hdr->b_size = size;
hdr->b_type = type;
- hdr->b_spa = spa_guid(spa);
+ hdr->b_spa = spa_load_guid(spa);
hdr->b_state = arc_anon;
hdr->b_arc_access = 0;
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
hdr->b_buf = buf;
arc_get_data_buf(buf);
bcopy(from->b_data, buf->b_data, size);
+
+ /*
+ * This buffer already exists in the arc so create a duplicate
+ * copy for the caller. If the buffer is associated with user data
+ * then track the size and number of duplicates. These stats will be
+ * updated as duplicate buffers are created and destroyed.
+ */
+ if (hdr->b_type == ARC_BUFC_DATA) {
+ ARCSTAT_BUMP(arcstat_duplicate_buffers);
+ ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
+ }
hdr->b_datacnt += 1;
return (buf);
}
{
if (HDR_L2_WRITING(hdr)) {
l2arc_data_free_t *df;
- df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+ df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE);
df->l2df_data = data;
df->l2df_size = size;
df->l2df_func = free_func;
ASSERT3U(state->arcs_size, >=, size);
atomic_add_64(&state->arcs_size, -size);
buf->b_data = NULL;
+
+ /*
+ * If we're destroying a duplicate buffer make sure
+ * that the appropriate statistics are updated.
+ */
+ if (buf->b_hdr->b_datacnt > 1 &&
+ buf->b_hdr->b_type == ARC_BUFC_DATA) {
+ ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+ ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
+ }
ASSERT(buf->b_hdr->b_datacnt > 0);
buf->b_hdr->b_datacnt -= 1;
}
}
/*
+ * Called from the DMU to determine if the current buffer should be
+ * evicted. In order to ensure proper locking, the eviction must be initiated
+ * from the DMU. Return true if the buffer is associated with user data and
+ * duplicate buffers still exist.
+ */
+boolean_t
+arc_buf_eviction_needed(arc_buf_t *buf)
+{
+	boolean_t needed;
+	arc_buf_hdr_t *owner;
+
+	/* The zfs_disable_dup_eviction tunable turns this check off. */
+	if (zfs_disable_dup_eviction)
+		return (B_FALSE);
+
+	mutex_enter(&buf->b_evict_lock);
+	owner = buf->b_hdr;
+
+	if (owner == NULL) {
+		/*
+		 * No header: arc_do_user_evicts() is already handling
+		 * this buffer, so leave the eviction to that function.
+		 */
+		ASSERT(buf->b_data == NULL);
+		needed = B_FALSE;
+	} else if (buf->b_data == NULL) {
+		/*
+		 * The buffer is already sitting on the arc eviction
+		 * list, so tell the caller to go ahead and evict.
+		 */
+		ASSERT3P(owner, ==, &arc_eviction_hdr);
+		needed = B_TRUE;
+	} else {
+		/*
+		 * Recommend eviction only for user data whose header
+		 * currently carries more than one buffer.
+		 */
+		needed = (owner->b_datacnt > 1 &&
+		    owner->b_type == ARC_BUFC_DATA) ? B_TRUE : B_FALSE;
+	}
+
+	mutex_exit(&buf->b_evict_lock);
+	return (needed);
+}
+
+/*
* Evict buffers from list until we've removed the specified number of
* bytes. Move the removed buffers to the appropriate evict state.
* If the recycle flag is set, then attempt to "recycle" a buffer:
hash_lock = HDR_LOCK(ab);
have_lock = MUTEX_HELD(hash_lock);
if (have_lock || mutex_tryenter(hash_lock)) {
- ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+ ASSERT0(refcount_count(&ab->b_refcnt));
ASSERT(ab->b_datacnt > 0);
while (ab->b_buf) {
arc_buf_t *buf = ab->b_buf;
uint64_t guid = 0;
if (spa)
- guid = spa_guid(spa);
+ guid = spa_load_guid(spa);
while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
}
void
-arc_shrink(void)
+arc_shrink(uint64_t bytes)
{
if (arc_c > arc_c_min) {
uint64_t to_free;
-#ifdef _KERNEL
- to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
-#else
- to_free = arc_c >> arc_shrink_shift;
-#endif
+ to_free = bytes ? bytes : arc_c >> arc_shrink_shift;
+
if (arc_c > arc_c_min + to_free)
atomic_add_64(&arc_c, -to_free);
else
arc_adjust();
}
-static int
-arc_reclaim_needed(void)
-{
-#ifdef _KERNEL
- uint64_t extra;
-
- if (needfree)
- return (1);
-
- /*
- * take 'desfree' extra pages, so we reclaim sooner, rather than later
- */
- extra = desfree;
-
- /*
- * check that we're out of range of the pageout scanner. It starts to
- * schedule paging if freemem is less than lotsfree and needfree.
- * lotsfree is the high-water mark for pageout, and needfree is the
- * number of needed free pages. We add extra pages here to make sure
- * the scanner doesn't start up while we're freeing memory.
- */
- if (freemem < lotsfree + needfree + extra)
- return (1);
-
- /*
- * check to make sure that swapfs has enough space so that anon
- * reservations can still succeed. anon_resvmem() checks that the
- * availrmem is greater than swapfs_minfree, and the number of reserved
- * swap pages. We also add a bit of extra here just to prevent
- * circumstances from getting really dire.
- */
- if (availrmem < swapfs_minfree + swapfs_reserve + extra)
- return (1);
-
-#if defined(__i386)
- /*
- * If we're on an i386 platform, it's possible that we'll exhaust the
- * kernel heap space before we ever run out of available physical
- * memory. Most checks of the size of the heap_area compare against
- * tune.t_minarmem, which is the minimum available real memory that we
- * can have in the system. However, this is generally fixed at 25 pages
- * which is so low that it's useless. In this comparison, we seek to
- * calculate the total heap-size, and reclaim if more than 3/4ths of the
- * heap is allocated. (Or, in the calculation, if less than 1/4th is
- * free)
- */
- if (btop(vmem_size(heap_arena, VMEM_FREE)) <
- (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
- return (1);
-#endif
-
-#else
- if (spa_get_random(100) == 0)
- return (1);
-#endif
- return (0);
-}
-
static void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
{
size_t i;
kmem_cache_t *prev_cache = NULL;
* reap free buffers from the arc kmem caches.
*/
if (strat == ARC_RECLAIM_AGGR)
- arc_shrink();
+ arc_shrink(bytes);
for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
if (zio_buf_cache[i] != prev_cache) {
kmem_cache_reap_now(hdr_cache);
}
+/*
+ * Unlike other ZFS implementations this thread is only responsible for
+ * adapting the target ARC size on Linux. The responsibility for memory
+ * reclamation has been entirely delegated to the arc_shrinker_func()
+ * which is registered with the VM. To reflect this change in behavior
+ * the arc_reclaim thread has been renamed to arc_adapt.
+ */
static void
-arc_reclaim_thread(void)
+arc_adapt_thread(void)
{
- clock_t growtime = 0;
- arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
callb_cpr_t cpr;
int64_t prune;
mutex_enter(&arc_reclaim_thr_lock);
while (arc_thread_exit == 0) {
- if (arc_reclaim_needed()) {
+#ifndef _KERNEL
+ arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
+
+ if (spa_get_random(100) == 0) {
if (arc_no_grow) {
if (last_reclaim == ARC_RECLAIM_CONS) {
}
/* reset the growth delay for every reclaim */
- growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
+ arc_grow_time = ddi_get_lbolt()+(arc_grow_retry * hz);
- arc_kmem_reap_now(last_reclaim);
+ arc_kmem_reap_now(last_reclaim, 0);
arc_warm = B_TRUE;
+ }
+#endif /* !_KERNEL */
- } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
+ /* No recent memory pressure, allow the ARC to grow. */
+ if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
arc_no_grow = FALSE;
- }
/*
* Keep meta data usage within limits, arc_shrink() is not
#ifdef _KERNEL
/*
- * Under Linux the arc shrinker may be called for synchronous (direct)
- * reclaim, or asynchronous (indirect) reclaim. When called by kswapd
- * for indirect reclaim we take a conservative approach and just reap
- * free slabs from the ARC caches. If this proves to be insufficient
- * direct reclaim will be trigger. In direct reclaim a more aggressive
- * strategy is used, data is evicted from the ARC and free slabs reaped.
+ * Determine the amount of memory eligible for eviction contained in the
+ * ARC. All clean data reported by the ghost lists can always be safely
+ * evicted. Due to arc_c_min, the same does not hold for all clean data
+ * contained by the regular mru and mfu lists.
+ *
+ * In the case of the regular mru and mfu lists, we need to report as
+ * much clean data as possible, such that evicting that same reported
+ * data will not bring arc_size below arc_c_min. Thus, in certain
+ * circumstances, the total amount of clean data in the mru and mfu
+ * lists might not actually be evictable.
+ *
+ * The following two distinct cases are accounted for:
+ *
+ * 1. The sum of the amount of dirty data contained by both the mru and
+ * mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ * is greater than or equal to arc_c_min.
+ * (i.e. amount of dirty data >= arc_c_min)
+ *
+ * This is the easy case; all clean data contained by the mru and mfu
+ * lists is evictable. Evicting all clean data can only drop arc_size
+ * to the amount of dirty data, which is greater than or equal to
+ * arc_c_min.
+ *
+ * 2. The sum of the amount of dirty data contained by both the mru and
+ * mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ * is less than arc_c_min.
+ * (i.e. arc_c_min > amount of dirty data)
+ *
+ * 2.1. arc_size is greater than or equal to arc_c_min.
+ * (i.e. arc_size >= arc_c_min > amount of dirty data)
+ *
+ * In this case, not all clean data from the regular mru and mfu
+ * lists is actually evictable; we must leave enough clean data
+ * to keep arc_size above arc_c_min. Thus, the maximum amount of
+ * evictable data from the two lists combined, is exactly the
+ * difference between arc_size and arc_c_min.
+ *
+ * 2.2. arc_size is less than arc_c_min
+ * (i.e. arc_c_min > arc_size > amount of dirty data)
+ *
+ * In this case, none of the data contained in the mru and mfu
+ * lists is evictable, even if it's clean. Since arc_size is
+ * already below arc_c_min, evicting any more would only
+ * increase this negative difference.
*/
+static uint64_t
+arc_evictable_memory(void)
+{
+	uint64_t resident_clean, ghost_clean;
+	int64_t dirty, headroom;
+
+	/* Bytes accounted on the mru and mfu lists, data plus metadata. */
+	resident_clean = arc_mru->arcs_lsize[ARC_BUFC_DATA];
+	resident_clean += arc_mru->arcs_lsize[ARC_BUFC_METADATA];
+	resident_clean += arc_mfu->arcs_lsize[ARC_BUFC_DATA];
+	resident_clean += arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+
+	/* Bytes accounted on the two ghost lists; always reported. */
+	ghost_clean = arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA];
+	ghost_clean += arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA];
+	ghost_clean += arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA];
+	ghost_clean += arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
+
+	/* Whatever arc_size holds beyond the clean lists, floored at 0. */
+	dirty = (int64_t)arc_size - (int64_t)resident_clean;
+	if (dirty < 0)
+		dirty = 0;
+
+	/* Case 1: the remainder alone already reaches arc_c_min. */
+	if ((uint64_t)dirty >= arc_c_min)
+		return (ghost_clean + resident_clean);
+
+	/* Case 2: report only what lies above arc_c_min, floored at 0. */
+	headroom = (int64_t)arc_size - (int64_t)arc_c_min;
+	if (headroom < 0)
+		headroom = 0;
+
+	return (ghost_clean + (uint64_t)headroom);
+}
+
static int
__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
{
- arc_reclaim_strategy_t strategy;
- int arc_reclaim;
+ uint64_t pages;
- /* Return number of reclaimable pages based on arc_shrink_shift */
- arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
- >> arc_shrink_shift, 0);
- if (sc->nr_to_scan == 0)
- return (arc_reclaim);
+ /* The arc is considered warm once reclaim has occurred */
+ if (unlikely(arc_warm == B_FALSE))
+ arc_warm = B_TRUE;
- /* Prevent reclaim below arc_c_min */
- if (arc_reclaim <= 0)
- return (-1);
+ /* Return the potential number of reclaimable pages */
+ pages = btop(arc_evictable_memory());
+ if (sc->nr_to_scan == 0)
+ return (pages);
/* Not allowed to perform filesystem reclaim */
if (!(sc->gfp_mask & __GFP_FS))
if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
return (-1);
+ /*
+ * Evict the requested number of pages by shrinking arc_c the
+ * requested amount. If there is nothing left to evict just
+ * reap whatever we can from the various arc slabs.
+ */
+ if (pages > 0) {
+ arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
+ pages = btop(arc_evictable_memory());
+ } else {
+ arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
+ pages = -1;
+ }
+
+ /*
+ * When direct reclaim is observed it usually indicates a rapid
+ * increase in memory pressure. This occurs because the kswapd
+ * threads were unable to asynchronously keep enough free memory
+ * available. In this case set arc_no_grow to briefly pause arc
+ * growth to avoid compounding the memory pressure.
+ */
if (current_is_kswapd()) {
- strategy = ARC_RECLAIM_CONS;
- ARCSTAT_INCR(arcstat_memory_indirect_count, 1);
+ ARCSTAT_BUMP(arcstat_memory_indirect_count);
} else {
- strategy = ARC_RECLAIM_AGGR;
- ARCSTAT_INCR(arcstat_memory_direct_count, 1);
+ arc_no_grow = B_TRUE;
+ arc_grow_time = ddi_get_lbolt() + (arc_grow_retry * hz);
+ ARCSTAT_BUMP(arcstat_memory_direct_count);
}
- arc_kmem_reap_now(strategy);
- arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
- >> arc_shrink_shift, 0);
mutex_exit(&arc_reclaim_thr_lock);
- return (arc_reclaim);
+ return (pages);
}
SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
}
ASSERT((int64_t)arc_p >= 0);
- if (arc_reclaim_needed()) {
- cv_signal(&arc_reclaim_thr_cv);
- return;
- }
-
if (arc_no_grow)
return;
if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
return (1);
-#ifdef _KERNEL
- /*
- * If zio data pages are being allocated out of a separate heap segment,
- * then enforce that the size of available vmem for this area remains
- * above about 1/32nd free.
- */
- if (type == ARC_BUFC_DATA && zio_arena != NULL &&
- vmem_size(zio_arena, VMEM_FREE) <
- (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
- return (1);
-#endif
-
- if (arc_reclaim_needed())
+ if (arc_no_grow)
return (1);
return (arc_size > arc_c);
* This is a prefetch access...
* move this block back to the MRU state.
*/
- ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
+ ASSERT0(refcount_count(&buf->b_refcnt));
new_state = arc_mru;
}
callback_list = hdr->b_acb;
ASSERT(callback_list != NULL);
if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
- arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
- byteswap_uint64_array :
- dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
- func(buf->b_data, hdr->b_size);
+ dmu_object_byteswap_t bswap =
+ DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
+ if (BP_GET_LEVEL(zio->io_bp) > 0)
+ byteswap_uint64_array(buf->b_data, hdr->b_size);
+ else
+ dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
}
arc_cksum_compute(buf, B_FALSE);
abuf = buf;
for (acb = callback_list; acb; acb = acb->acb_next) {
if (acb->acb_done) {
- if (abuf == NULL)
+ if (abuf == NULL) {
+ ARCSTAT_BUMP(arcstat_duplicate_reads);
abuf = arc_buf_clone(buf);
+ }
acb->acb_buf = abuf;
abuf = NULL;
}
}
/*
- * "Read" the block block at the specified DVA (in bp) via the
+ * "Read" the block at the specified DVA (in bp) via the
* cache. If the block is found in the cache, invoke the provided
* callback immediately and return. Note that the `zio' parameter
* in the callback will be NULL in this case, since no IO was
*
* arc_read_done() will invoke all the requested "done" functions
* for readers of this block.
- *
- * Normal callers should use arc_read and pass the arc buffer and offset
- * for the bp. But if you know you don't need locking, you can use
- * arc_read_bp.
*/
int
-arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
- arc_done_func_t *done, void *private, int priority, int zio_flags,
- uint32_t *arc_flags, const zbookmark_t *zb)
-{
- int err;
-
- if (pbuf == NULL) {
- /*
- * XXX This happens from traverse callback funcs, for
- * the objset_phys_t block.
- */
- return (arc_read_nolock(pio, spa, bp, done, private, priority,
- zio_flags, arc_flags, zb));
- }
-
- ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
- ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
- rw_enter(&pbuf->b_data_lock, RW_READER);
-
- err = arc_read_nolock(pio, spa, bp, done, private, priority,
- zio_flags, arc_flags, zb);
- rw_exit(&pbuf->b_data_lock);
-
- return (err);
-}
-
-int
-arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
- arc_done_func_t *done, void *private, int priority, int zio_flags,
- uint32_t *arc_flags, const zbookmark_t *zb)
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
+ void *private, int priority, int zio_flags, uint32_t *arc_flags,
+ const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf = NULL;
kmutex_t *hash_lock;
zio_t *rzio;
- uint64_t guid = spa_guid(spa);
+ uint64_t guid = spa_load_guid(spa);
top:
hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
/* this block is in the ghost cache */
ASSERT(GHOST_STATE(hdr->b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
+ ASSERT0(refcount_count(&hdr->b_refcnt));
ASSERT(hdr->b_buf == NULL);
/* if this is a prefetch, we don't have a reference */
}
/*
+ * Notify the arc that a block was freed, and thus will never be used again.
+ */
+void
+arc_freed(spa_t *spa, const blkptr_t *bp)
+{
+	uint64_t load_guid = spa_load_guid(spa);
+	kmutex_t *hash_lock;
+	arc_buf_hdr_t *hdr;
+	arc_buf_t *buf;
+
+	/* Nothing to do when the block has no header in the hash table. */
+	hdr = buf_hash_find(load_guid, BP_IDENTITY(bp),
+	    BP_PHYSICAL_BIRTH(bp), &hash_lock);
+	if (hdr == NULL)
+		return;
+
+	/* No available buffer to claim: just drop the hash lock. */
+	if (!HDR_BUF_AVAILABLE(hdr)) {
+		mutex_exit(hash_lock);
+		return;
+	}
+
+	/*
+	 * Take a reference on the available buffer and clear the
+	 * "available" flag while the hash lock is still held, then
+	 * release the buffer and drop the reference without the lock.
+	 */
+	buf = hdr->b_buf;
+	add_reference(hdr, hash_lock, FTAG);
+	hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+	mutex_exit(hash_lock);
+
+	arc_release(buf, FTAG);
+	(void) arc_buf_remove_ref(buf, FTAG);
+}
+
+/*
* This is used by the DMU to let the ARC know that a buffer is
* being evicted, so the ARC should clean up. If this arc buf
* is not yet in the evicted state, it will be put there.
ASSERT3U(*size, >=, hdr->b_size);
atomic_add_64(size, -hdr->b_size);
}
+
+ /*
+ * We're releasing a duplicate user data buffer, update
+ * our statistics accordingly.
+ */
+ if (hdr->b_type == ARC_BUFC_DATA) {
+ ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+ ARCSTAT_INCR(arcstat_duplicate_buffers_size,
+ -hdr->b_size);
+ }
hdr->b_datacnt -= 1;
arc_cksum_verify(buf);
}
}
-/*
- * Release this buffer. If it does not match the provided BP, fill it
- * with that block's contents.
- */
-/* ARGSUSED */
-int
-arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
- zbookmark_t *zb)
-{
- arc_release(buf, tag);
- return (0);
-}
-
int
arc_released(arc_buf_t *buf)
{
ASSERT(hdr->b_acb == NULL);
if (l2arc)
hdr->b_flags |= ARC_L2CACHE;
- callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+ callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE);
callback->awcb_ready = ready;
callback->awcb_done = done;
callback->awcb_private = private;
arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
{
#ifdef _KERNEL
- uint64_t available_memory = ptob(freemem);
- static uint64_t page_load = 0;
- static uint64_t last_txg = 0;
+ uint64_t available_memory;
-#if defined(__i386)
- available_memory =
- MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
-#endif
- if (available_memory >= zfs_write_limit_max)
+ if (zfs_arc_memory_throttle_disable)
return (0);
- if (txg > last_txg) {
- last_txg = txg;
- page_load = 0;
- }
- /*
- * If we are in pageout, we know that memory is already tight,
- * the arc is already going to be evicting, so we just want to
- * continue to let page writes occur as quickly as possible.
- */
- if (curproc == proc_pageout) {
- if (page_load > MAX(ptob(minfree), available_memory) / 4)
- return (ERESTART);
- /* Note: reserve is inflated, so we deflate */
- page_load += reserve / 8;
- return (0);
- } else if (page_load > 0 && arc_reclaim_needed()) {
- /* memory is low, delay before restarting */
+ /* Easily reclaimable memory (free + inactive + arc-evictable) */
+ available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();
+
+ if (available_memory <= zfs_write_limit_max) {
ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
return (EAGAIN);
}
- page_load = 0;
-
- if (arc_size > arc_c_min) {
- uint64_t evictable_memory =
- arc_mru->arcs_lsize[ARC_BUFC_DATA] +
- arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
- available_memory += MIN(evictable_memory, arc_size - arc_c_min);
- }
if (inflight_data > available_memory / 4) {
ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ DMU_TX_STAT_BUMP(dmu_tx_memory_inflight);
return (ERESTART);
}
#endif
#endif
if (reserve > arc_c/4 && !arc_no_grow)
arc_c = MIN(arc_c_max, reserve * 4);
- if (reserve > arc_c)
+ if (reserve > arc_c) {
+ DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
return (ENOMEM);
+ }
/*
* Don't count loaned bufs as in flight dirty data to prevent long
arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
reserve>>10, arc_c>>10);
+ DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
return (ERESTART);
}
atomic_add_64(&arc_tempreserve, reserve);
&as->arcstat_mfu_size,
&as->arcstat_mfu_evict_data,
&as->arcstat_mfu_evict_metadata);
- arc_kstat_update_state(arc_mru_ghost,
+ arc_kstat_update_state(arc_mfu_ghost,
&as->arcstat_mfu_ghost_size,
&as->arcstat_mfu_ghost_evict_data,
&as->arcstat_mfu_ghost_evict_metadata);
/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
arc_c_min = MAX(arc_c / 4, 64<<20);
- /* set max to 1/2 of all memory, or all but 4GB, whichever is more */
- if (arc_c * 8 >= ((uint64_t)4<<30))
- arc_c_max = (arc_c * 8) - ((uint64_t)4<<30);
- else
- arc_c_max = arc_c_min;
+ /* set max to 1/2 of all memory */
arc_c_max = MAX(arc_c * 4, arc_c_max);
/*
kstat_install(arc_ksp);
}
- (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+ (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
TS_RUN, minclsyspri);
arc_dead = FALSE;
boolean_t have_lock, full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
- uint64_t guid = spa_guid(spa);
+ uint64_t guid = spa_load_guid(spa);
int try;
ASSERT(dev->l2ad_vdev != NULL);
*/
list_insert_head(dev->l2ad_buflist, head);
- cb = kmem_alloc(
- sizeof (l2arc_write_callback_t), KM_SLEEP);
+ cb = kmem_alloc(sizeof (l2arc_write_callback_t),
+ KM_PUSHPAGE);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
pio = zio_root(spa, l2arc_write_done, cb,
/*
* Create and add a new L2ARC header.
*/
- hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+ hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
+ KM_PUSHPAGE);
hdrl2->b_dev = dev;
hdrl2->b_daddr = dev->l2ad_hand;
mutex_exit(&l2arc_buflist_mtx);
if (pio == NULL) {
- ASSERT3U(write_sz, ==, 0);
+ ASSERT0(write_sz);
kmem_cache_free(hdr_cache, head);
return (0);
}
/*
* Avoid contributing to memory pressure.
*/
- if (arc_reclaim_needed()) {
+ if (arc_no_grow) {
ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
spa_config_exit(spa, SCL_L2ARC, dev);
continue;
module_param(zfs_arc_p_min_shift, int, 0444);
MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
+module_param(zfs_disable_dup_eviction, int, 0644);
+MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
+
+module_param(zfs_arc_memory_throttle_disable, int, 0644);
+MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");
+
module_param(l2arc_write_max, ulong, 0444);
MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");