X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Farc.c;h=ce4a0239c0c0ba18370fd9ea43cb2b176fb90e05;hb=68121a03daf58a7d5b9351f110196b8ce806e1fa;hp=cbc27d3687a936ffdc7e4bd3aa23fd68db862e8f;hpb=409dc1a570a836737b2a5bb43658cdde703c935e;p=zfs.git diff --git a/module/zfs/arc.c b/module/zfs/arc.c index cbc27d3..ce4a023 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -20,6 +20,9 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ /* @@ -78,9 +81,9 @@ * types of locks: 1) the hash table lock array, and 2) the * arc list locks. * - * Buffers do not have their own mutexs, rather they rely on the - * hash table mutexs for the bulk of their protection (i.e. most - * fields in the arc_buf_hdr_t are protected by these mutexs). + * Buffers do not have their own mutexes, rather they rely on the + * hash table mutexes for the bulk of their protection (i.e. most + * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns @@ -126,6 +129,7 @@ #include #include +#include #include #include #include @@ -145,12 +149,8 @@ static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static uint8_t arc_thread_exit; -extern int zfs_write_limit_shift; -extern uint64_t zfs_write_limit_max; -extern kmutex_t zfs_write_limit_lock; - /* number of bytes to prune from caches when at arc_meta_limit is reached */ -uint_t arc_meta_prune = 1048576; +int zfs_arc_meta_prune = 1048576; typedef enum arc_reclaim_strategy { ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ @@ -158,22 +158,31 @@ typedef enum arc_reclaim_strategy { } arc_reclaim_strategy_t; /* number of seconds before growing cache again */ -static int arc_grow_retry = 60; +int zfs_arc_grow_retry = 5; /* shift of arc_c for calculating both min and max arc_p */ -static int arc_p_min_shift = 4; +int zfs_arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ -static int arc_shrink_shift = 5; +int zfs_arc_shrink_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ -static int arc_min_prefetch_lifespan; +int zfs_arc_min_prefetch_lifespan = HZ; + +/* disable arc proactive arc throttle due to low memory */ +int zfs_arc_memory_throttle_disable = 1; + +/* disable duplicate buffer eviction */ +int zfs_disable_dup_eviction = 0; static int arc_dead; +/* expiration time for arc_no_grow */ +static clock_t arc_grow_time = 0; + /* * The arc has filled available memory and has now warmed up. */ @@ -185,10 +194,6 @@ static boolean_t arc_warm; unsigned long zfs_arc_max = 0; unsigned long zfs_arc_min = 0; unsigned long zfs_arc_meta_limit = 0; -int zfs_arc_grow_retry = 0; -int zfs_arc_shrink_shift = 0; -int zfs_arc_p_min_shift = 0; -int zfs_arc_meta_prune = 0; /* * Note that buffers can be in one of 6 states: @@ -304,8 +309,15 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; + kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; + kstat_named_t arcstat_l2_compress_successes; + kstat_named_t arcstat_l2_compress_zeros; + kstat_named_t arcstat_l2_compress_failures; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_duplicate_buffers; + kstat_named_t arcstat_duplicate_buffers_size; + kstat_named_t arcstat_duplicate_reads; kstat_named_t arcstat_memory_direct_count; kstat_named_t arcstat_memory_indirect_count; kstat_named_t arcstat_no_grow; @@ -384,8 +396,15 @@ static arc_stats_t arc_stats = { { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, + { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "l2_compress_successes", KSTAT_DATA_UINT64 }, + { "l2_compress_zeros", KSTAT_DATA_UINT64 }, + { "l2_compress_failures", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "duplicate_buffers", KSTAT_DATA_UINT64 }, + { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, + { "duplicate_reads", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, @@ -463,6 +482,9 @@ static arc_state_t *arc_l2c_only; #define arc_meta_limit ARCSTAT(arcstat_meta_limit) #define arc_meta_max ARCSTAT(arcstat_meta_max) +#define L2ARC_IS_VALID_COMPRESS(_c_) \ + ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) + typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; typedef struct arc_callback arc_callback_t; @@ -492,7 +514,6 @@ struct arc_buf_hdr { kmutex_t b_freeze_lock; zio_cksum_t *b_freeze_cksum; - void *b_thawed; arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; @@ -529,7 +550,8 @@ static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); static int arc_evict_needed(arc_buf_contents_t type); -static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); +static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type); static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); @@ -614,7 +636,12 @@ uint64_t zfs_crc64_table[256]; */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 2 /* num of writes */ +#define L2ARC_HEADROOM 2 /* num of writes */ +/* + * If we discover during ARC scan any buffers to be compressed, we boost + * our headroom for the next scanning cycle by this percentage multiple. + */ +#define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ @@ -627,11 +654,13 @@ uint64_t zfs_crc64_table[256]; unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +int l2arc_nocompress = B_FALSE; /* don't compress bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ -int l2arc_norw = B_TRUE; /* no reads during writes */ +int l2arc_norw = B_FALSE; /* no reads during writes */ /* * L2ARC Internals @@ -640,8 +669,6 @@ typedef struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ spa_t *l2ad_spa; /* spa */ uint64_t l2ad_hand; /* next write location */ - uint64_t l2ad_write; /* desired write size, bytes */ - uint64_t l2ad_boost; /* warmup write boost, bytes */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ uint64_t l2ad_evict; /* last addr eviction reached */ @@ -662,11 +689,12 @@ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { - arc_buf_t *l2rcb_buf; /* read buffer */ - spa_t *l2rcb_spa; /* spa */ - blkptr_t l2rcb_bp; /* original blkptr */ - zbookmark_t l2rcb_zb; /* original bookmark */ - int l2rcb_flags; /* original flags */ + arc_buf_t *l2rcb_buf; /* read buffer */ + spa_t *l2rcb_spa; /* spa */ + blkptr_t l2rcb_bp; /* original blkptr */ + zbookmark_t l2rcb_zb; /* original bookmark */ + int l2rcb_flags; /* original flags */ + enum zio_compress l2rcb_compress; /* applied compress */ } l2arc_read_callback_t; typedef struct l2arc_write_callback { @@ -676,8 +704,14 @@ typedef struct l2arc_write_callback { struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ - l2arc_dev_t *b_dev; /* L2ARC device */ - uint64_t b_daddr; /* disk address, offset byte */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + /* compression applied to buffer data */ + enum zio_compress b_compress; + /* real alloc'd buffer size depending on b_compress applied */ + int b_asize; + /* temporary buffer holder for in-flight compressed data */ + void *b_tmp_cdata; }; typedef struct l2arc_data_free { @@ -696,6 +730,11 @@ static void l2arc_read_done(zio_t *zio); static void l2arc_hdr_stat_add(void); static void l2arc_hdr_stat_remove(void); +static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); +static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, + enum zio_compress c); +static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); + static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { @@ -875,7 +914,6 @@ buf_cons(void *vbuf, void *unused, int kmflag) bzero(buf, sizeof (arc_buf_t)); mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); @@ -905,25 +943,9 @@ buf_dest(void *vbuf, void *unused) arc_buf_t *buf = vbuf; mutex_destroy(&buf->b_evict_lock); - rw_destroy(&buf->b_data_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } -/* - * Reclaim callback -- invoked when memory is low. - */ -/* ARGSUSED */ -static void -hdr_recl(void *unused) -{ - /* - * umem calls the reclaim func when we destroy the buf cache, - * which is after we do arc_fini(). - */ - if (!arc_dead) - cv_signal(&arc_reclaim_thr_cv); -} - static void buf_init(void) { @@ -956,7 +978,7 @@ retry: } hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), - 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); + 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); @@ -1041,12 +1063,6 @@ arc_buf_thaw(arc_buf_t *buf) buf->b_hdr->b_freeze_cksum = NULL; } - if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_thawed) - kmem_free(buf->b_hdr->b_thawed, 1); - buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); - } - mutex_exit(&buf->b_hdr->b_freeze_lock); } @@ -1083,7 +1099,7 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) ASSERT(list_link_active(&ab->b_arc_node)); list_remove(list, ab); if (GHOST_STATE(ab->b_state)) { - ASSERT3U(ab->b_datacnt, ==, 0); + ASSERT0(ab->b_datacnt); ASSERT3P(ab->b_buf, ==, NULL); delta = ab->b_size; } @@ -1268,23 +1284,6 @@ arc_space_return(uint64_t space, arc_space_type_t type) atomic_add_64(&arc_size, -space); } -void * -arc_data_buf_alloc(uint64_t size) -{ - if (arc_evict_needed(ARC_BUFC_DATA)) - cv_signal(&arc_reclaim_thr_cv); - atomic_add_64(&arc_size, size); - return (zio_data_buf_alloc(size)); -} - -void -arc_data_buf_free(void *buf, uint64_t size) -{ - zio_data_buf_free(buf, size); - ASSERT(arc_size >= size); - atomic_add_64(&arc_size, -size); -} - arc_buf_t * arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) { @@ -1296,7 +1295,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; - hdr->b_spa = spa_guid(spa); + hdr->b_spa = spa_load_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); @@ -1383,6 +1382,17 @@ arc_buf_clone(arc_buf_t *from) hdr->b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); + + /* + * This buffer already exists in the arc so create a duplicate + * copy for the caller. If the buffer is associated with user data + * then track the size and number of duplicates. These stats will be + * updated as duplicate buffers are created and destroyed. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMP(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); + } hdr->b_datacnt += 1; return (buf); } @@ -1430,7 +1440,7 @@ arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), { if (HDR_L2_WRITING(hdr)) { l2arc_data_free_t *df; - df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE); df->l2df_data = data; df->l2df_size = size; df->l2df_func = free_func; @@ -1481,6 +1491,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); buf->b_data = NULL; + + /* + * If we're destroying a duplicate buffer make sure + * that the appropriate statistics are updated. + */ + if (buf->b_hdr->b_datacnt > 1 && + buf->b_hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); + } ASSERT(buf->b_hdr->b_datacnt > 0); buf->b_hdr->b_datacnt -= 1; } @@ -1531,7 +1551,9 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) if (l2hdr != NULL) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); if (hdr->b_state == arc_l2c_only) l2arc_hdr_stat_remove(); hdr->b_l2hdr = NULL; @@ -1567,10 +1589,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } - if (hdr->b_thawed) { - kmem_free(hdr->b_thawed, 1); - hdr->b_thawed = NULL; - } ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -1629,7 +1647,7 @@ int arc_buf_remove_ref(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); + kmutex_t *hash_lock = NULL; int no_callback = (buf->b_efunc == NULL); if (hdr->b_state == arc_anon) { @@ -1638,6 +1656,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) return (no_callback); } + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); @@ -1666,6 +1685,48 @@ arc_buf_size(arc_buf_t *buf) } /* + * Called from the DMU to determine if the current buffer should be + * evicted. In order to ensure proper locking, the eviction must be initiated + * from the DMU. Return true if the buffer is associated with user data and + * duplicate buffers still exist. + */ +boolean_t +arc_buf_eviction_needed(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + boolean_t evict_needed = B_FALSE; + + if (zfs_disable_dup_eviction) + return (B_FALSE); + + mutex_enter(&buf->b_evict_lock); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(); let that function + * perform the eviction. + */ + ASSERT(buf->b_data == NULL); + mutex_exit(&buf->b_evict_lock); + return (B_FALSE); + } else if (buf->b_data == NULL) { + /* + * We have already been added to the arc eviction list; + * recommend eviction. + */ + ASSERT3P(hdr, ==, &arc_eviction_hdr); + mutex_exit(&buf->b_evict_lock); + return (B_TRUE); + } + + if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) + evict_needed = B_TRUE; + + mutex_exit(&buf->b_evict_lock); + return (evict_needed); +} + +/* * Evict buffers from list until we've removed the specified number of * bytes. Move the removed buffers to the appropriate evict state. * If the recycle flag is set, then attempt to "recycle" a buffer: @@ -1704,7 +1765,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, (spa && ab->b_spa != spa) || (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && ddi_get_lbolt() - ab->b_arc_access < - arc_min_prefetch_lifespan)) { + zfs_arc_min_prefetch_lifespan)) { skipped++; continue; } @@ -1715,7 +1776,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, hash_lock = HDR_LOCK(ab); have_lock = MUTEX_HELD(hash_lock); if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); + ASSERT0(refcount_count(&ab->b_refcnt)); ASSERT(ab->b_datacnt > 0); while (ab->b_buf) { arc_buf_t *buf = ab->b_buf; @@ -1804,12 +1865,14 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mru_ghost->arcs_lsize[type], mru_over); - arc_evict_ghost(arc_mru_ghost, 0, todelete); + arc_evict_ghost(arc_mru_ghost, 0, todelete, + ARC_BUFC_DATA); } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c); - arc_evict_ghost(arc_mfu_ghost, 0, todelete); + arc_evict_ghost(arc_mfu_ghost, 0, todelete, + ARC_BUFC_DATA); } } @@ -1821,11 +1884,12 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, * bytes. Destroy the buffers that are removed. */ static void -arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) +arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type) { arc_buf_hdr_t *ab, *ab_prev; arc_buf_hdr_t marker; - list_t *list = &state->arcs_list[ARC_BUFC_DATA]; + list_t *list = &state->arcs_list[type]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; @@ -1955,7 +2019,7 @@ arc_adjust(void) if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { delta = MIN(arc_mru_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mru_ghost, 0, delta); + arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_DATA); } adjustment = @@ -1963,7 +2027,7 @@ arc_adjust(void) if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { delta = MIN(arc_mfu_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mfu_ghost, 0, delta); + arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_DATA); } } @@ -2055,7 +2119,7 @@ arc_adjust_meta(int64_t adjustment, boolean_t may_prune) } if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit)) - arc_do_user_prune(arc_meta_prune); + arc_do_user_prune(zfs_arc_meta_prune); } /* @@ -2068,7 +2132,7 @@ arc_flush(spa_t *spa) uint64_t guid = 0; if (spa) - guid = spa_guid(spa); + guid = spa_load_guid(spa); while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); @@ -2091,8 +2155,8 @@ arc_flush(spa_t *spa) break; } - arc_evict_ghost(arc_mru_ghost, guid, -1); - arc_evict_ghost(arc_mfu_ghost, guid, -1); + arc_evict_ghost(arc_mru_ghost, guid, -1, ARC_BUFC_DATA); + arc_evict_ghost(arc_mfu_ghost, guid, -1, ARC_BUFC_DATA); mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); @@ -2101,22 +2165,19 @@ arc_flush(spa_t *spa) } void -arc_shrink(void) +arc_shrink(uint64_t bytes) { if (arc_c > arc_c_min) { uint64_t to_free; -#ifdef _KERNEL - to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); -#else - to_free = arc_c >> arc_shrink_shift; -#endif + to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift; + if (arc_c > arc_c_min + to_free) atomic_add_64(&arc_c, -to_free); else arc_c = arc_c_min; - atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); + atomic_add_64(&arc_p, -(arc_p >> zfs_arc_shrink_shift)); if (arc_c > arc_size) arc_c = MAX(arc_size, arc_c_min); if (arc_p > arc_c) @@ -2129,66 +2190,8 @@ arc_shrink(void) arc_adjust(); } -static int -arc_reclaim_needed(void) -{ -#ifdef _KERNEL - uint64_t extra; - - if (needfree) - return (1); - - /* - * take 'desfree' extra pages, so we reclaim sooner, rather than later - */ - extra = desfree; - - /* - * check that we're out of range of the pageout scanner. It starts to - * schedule paging if freemem is less than lotsfree and needfree. - * lotsfree is the high-water mark for pageout, and needfree is the - * number of needed free pages. We add extra pages here to make sure - * the scanner doesn't start up while we're freeing memory. - */ - if (freemem < lotsfree + needfree + extra) - return (1); - - /* - * check to make sure that swapfs has enough space so that anon - * reservations can still succeed. anon_resvmem() checks that the - * availrmem is greater than swapfs_minfree, and the number of reserved - * swap pages. We also add a bit of extra here just to prevent - * circumstances from getting really dire. - */ - if (availrmem < swapfs_minfree + swapfs_reserve + extra) - return (1); - -#if defined(__i386) - /* - * If we're on an i386 platform, it's possible that we'll exhaust the - * kernel heap space before we ever run out of available physical - * memory. Most checks of the size of the heap_area compare against - * tune.t_minarmem, which is the minimum available real memory that we - * can have in the system. However, this is generally fixed at 25 pages - * which is so low that it's useless. In this comparison, we seek to - * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. (Or, in the calculation, if less than 1/4th is - * free) - */ - if (btop(vmem_size(heap_arena, VMEM_FREE)) < - (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) - return (1); -#endif - -#else - if (spa_get_random(100) == 0) - return (1); -#endif - return (0); -} - static void -arc_kmem_reap_now(arc_reclaim_strategy_t strat) +arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) { size_t i; kmem_cache_t *prev_cache = NULL; @@ -2201,7 +2204,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) * reap free buffers from the arc kmem caches. */ if (strat == ARC_RECLAIM_AGGR) - arc_shrink(); + arc_shrink(bytes); for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { if (zio_buf_cache[i] != prev_cache) { @@ -2218,11 +2221,16 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) kmem_cache_reap_now(hdr_cache); } +/* + * Unlike other ZFS implementations this thread is only responsible for + * adapting the target ARC size on Linux. The responsibility for memory + * reclamation has been entirely delegated to the arc_shrinker_func() + * which is registered with the VM. To reflect this change in behavior + * the arc_reclaim thread has been renamed to arc_adapt. + */ static void -arc_reclaim_thread(void) +arc_adapt_thread(void) { - clock_t growtime = 0; - arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; callb_cpr_t cpr; int64_t prune; @@ -2230,7 +2238,10 @@ arc_reclaim_thread(void) mutex_enter(&arc_reclaim_thr_lock); while (arc_thread_exit == 0) { - if (arc_reclaim_needed()) { +#ifndef _KERNEL + arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; + + if (spa_get_random(100) == 0) { if (arc_no_grow) { if (last_reclaim == ARC_RECLAIM_CONS) { @@ -2245,14 +2256,16 @@ arc_reclaim_thread(void) } /* reset the growth delay for every reclaim */ - growtime = ddi_get_lbolt() + (arc_grow_retry * hz); + arc_grow_time = ddi_get_lbolt()+(zfs_arc_grow_retry * hz); - arc_kmem_reap_now(last_reclaim); + arc_kmem_reap_now(last_reclaim, 0); arc_warm = B_TRUE; + } +#endif /* !_KERNEL */ - } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { + /* No recent memory pressure allow the ARC to grow. */ + if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time) arc_no_grow = FALSE; - } /* * Keep meta data usage within limits, arc_shrink() is not @@ -2273,6 +2286,26 @@ arc_reclaim_thread(void) (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); + + + /* Allow the module options to be changed */ + if (zfs_arc_max > 64 << 20 && + zfs_arc_max < physmem * PAGESIZE && + zfs_arc_max != arc_c_max) + arc_c_max = zfs_arc_max; + + if (zfs_arc_min > 0 && + zfs_arc_min < arc_c_max && + zfs_arc_min != arc_c_min) + arc_c_min = zfs_arc_min; + + if (zfs_arc_meta_limit > 0 && + zfs_arc_meta_limit <= arc_c_max && + zfs_arc_meta_limit != arc_meta_limit) + arc_meta_limit = zfs_arc_meta_limit; + + + } arc_thread_exit = 0; @@ -2283,28 +2316,83 @@ arc_reclaim_thread(void) #ifdef _KERNEL /* - * Under Linux the arc shrinker may be called for synchronous (direct) - * reclaim, or asynchronous (indirect) reclaim. When called by kswapd - * for indirect reclaim we take a conservative approach and just reap - * free slabs from the ARC caches. If this proves to be insufficient - * direct reclaim will be trigger. In direct reclaim a more aggressive - * strategy is used, data is evicted from the ARC and free slabs reaped. + * Determine the amount of memory eligible for eviction contained in the + * ARC. All clean data reported by the ghost lists can always be safely + * evicted. Due to arc_c_min, the same does not hold for all clean data + * contained by the regular mru and mfu lists. + * + * In the case of the regular mru and mfu lists, we need to report as + * much clean data as possible, such that evicting that same reported + * data will not bring arc_size below arc_c_min. Thus, in certain + * circumstances, the total amount of clean data in the mru and mfu + * lists might not actually be evictable. + * + * The following two distinct cases are accounted for: + * + * 1. The sum of the amount of dirty data contained by both the mru and + * mfu lists, plus the ARC's other accounting (e.g. the anon list), + * is greater than or equal to arc_c_min. + * (i.e. amount of dirty data >= arc_c_min) + * + * This is the easy case; all clean data contained by the mru and mfu + * lists is evictable. Evicting all clean data can only drop arc_size + * to the amount of dirty data, which is greater than arc_c_min. + * + * 2. The sum of the amount of dirty data contained by both the mru and + * mfu lists, plus the ARC's other accounting (e.g. the anon list), + * is less than arc_c_min. + * (i.e. arc_c_min > amount of dirty data) + * + * 2.1. arc_size is greater than or equal arc_c_min. + * (i.e. arc_size >= arc_c_min > amount of dirty data) + * + * In this case, not all clean data from the regular mru and mfu + * lists is actually evictable; we must leave enough clean data + * to keep arc_size above arc_c_min. Thus, the maximum amount of + * evictable data from the two lists combined, is exactly the + * difference between arc_size and arc_c_min. + * + * 2.2. arc_size is less than arc_c_min + * (i.e. arc_c_min > arc_size > amount of dirty data) + * + * In this case, none of the data contained in the mru and mfu + * lists is evictable, even if it's clean. Since arc_size is + * already below arc_c_min, evicting any more would only + * increase this negative difference. */ +static uint64_t +arc_evictable_memory(void) { + uint64_t arc_clean = + arc_mru->arcs_lsize[ARC_BUFC_DATA] + + arc_mru->arcs_lsize[ARC_BUFC_METADATA] + + arc_mfu->arcs_lsize[ARC_BUFC_DATA] + + arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; + uint64_t ghost_clean = + arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] + + arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] + + arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] + + arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]; + uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0); + + if (arc_dirty >= arc_c_min) + return (ghost_clean + arc_clean); + + return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0)); +} + static int __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) { - arc_reclaim_strategy_t strategy; - int arc_reclaim; + uint64_t pages; - /* Return number of reclaimable pages based on arc_shrink_shift */ - arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min)) - >> arc_shrink_shift, 0); - if (sc->nr_to_scan == 0) - return (arc_reclaim); + /* The arc is considered warm once reclaim has occurred */ + if (unlikely(arc_warm == B_FALSE)) + arc_warm = B_TRUE; - /* Prevent reclaim below arc_c_min */ - if (arc_reclaim <= 0) - return (-1); + /* Return the potential number of reclaimable pages */ + pages = btop(arc_evictable_memory()); + if (sc->nr_to_scan == 0) + return (pages); /* Not allowed to perform filesystem reclaim */ if (!(sc->gfp_mask & __GFP_FS)) @@ -2314,20 +2402,35 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) if (mutex_tryenter(&arc_reclaim_thr_lock) == 0) return (-1); + /* + * Evict the requested number of pages by shrinking arc_c the + * requested amount. If there is nothing left to evict just + * reap whatever we can from the various arc slabs. + */ + if (pages > 0) { + arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan)); + } else { + arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan)); + } + + /* + * When direct reclaim is observed it usually indicates a rapid + * increase in memory pressure. This occurs because the kswapd + * threads were unable to asynchronously keep enough free memory + * available. In this case set arc_no_grow to briefly pause arc + * growth to avoid compounding the memory pressure. + */ if (current_is_kswapd()) { - strategy = ARC_RECLAIM_CONS; - ARCSTAT_INCR(arcstat_memory_indirect_count, 1); + ARCSTAT_BUMP(arcstat_memory_indirect_count); } else { - strategy = ARC_RECLAIM_AGGR; - ARCSTAT_INCR(arcstat_memory_direct_count, 1); + arc_no_grow = B_TRUE; + arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz); + ARCSTAT_BUMP(arcstat_memory_direct_count); } - arc_kmem_reap_now(strategy); - arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min)) - >> arc_shrink_shift, 0); mutex_exit(&arc_reclaim_thr_lock); - return (arc_reclaim); + return (-1); } SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func); @@ -2343,7 +2446,7 @@ static void arc_adapt(int bytes, arc_state_t *state) { int mult; - uint64_t arc_p_min = (arc_c >> arc_p_min_shift); + uint64_t arc_p_min = (arc_c >> zfs_arc_p_min_shift); if (state == arc_l2c_only) return; @@ -2375,11 +2478,6 @@ arc_adapt(int bytes, arc_state_t *state) } ASSERT((int64_t)arc_p >= 0); - if (arc_reclaim_needed()) { - cv_signal(&arc_reclaim_thr_cv); - return; - } - if (arc_no_grow) return; @@ -2412,19 +2510,7 @@ arc_evict_needed(arc_buf_contents_t type) if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) return (1); -#ifdef _KERNEL - /* - * If zio data pages are being allocated out of a separate heap segment, - * then enforce that the size of available vmem for this area remains - * above about 1/32nd free. - */ - if (type == ARC_BUFC_DATA && zio_arena != NULL && - vmem_size(zio_arena, VMEM_FREE) < - (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) - return (1); -#endif - - if (arc_reclaim_needed()) + if (arc_no_grow) return (1); return (arc_size > arc_c); @@ -2657,7 +2743,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * This is a prefetch access... * move this block back to the MRU state. */ - ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); + ASSERT0(refcount_count(&buf->b_refcnt)); new_state = arc_mru; } @@ -2739,10 +2825,12 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_acb; ASSERT(callback_list != NULL); if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { - arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? - byteswap_uint64_array : - dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; - func(buf->b_data, hdr->b_size); + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + if (BP_GET_LEVEL(zio->io_bp) > 0) + byteswap_uint64_array(buf->b_data, hdr->b_size); + else + dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size); } arc_cksum_compute(buf, B_FALSE); @@ -2761,8 +2849,10 @@ arc_read_done(zio_t *zio) abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { if (acb->acb_done) { - if (abuf == NULL) + if (abuf == NULL) { + ARCSTAT_BUMP(arcstat_duplicate_reads); abuf = arc_buf_clone(buf); + } acb->acb_buf = abuf; abuf = NULL; } @@ -2826,7 +2916,7 @@ arc_read_done(zio_t *zio) } /* - * "Read" the block block at the specified DVA (in bp) via the + * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was @@ -2842,48 +2932,17 @@ arc_read_done(zio_t *zio) * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. - * - * Normal callers should use arc_read and pass the arc buffer and offset - * for the bp. But if you know you don't need locking, you can use - * arc_read_bp. */ int -arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, - arc_done_func_t *done, void *private, int priority, int zio_flags, - uint32_t *arc_flags, const zbookmark_t *zb) -{ - int err; - - if (pbuf == NULL) { - /* - * XXX This happens from traverse callback funcs, for - * the objset_phys_t block. - */ - return (arc_read_nolock(pio, spa, bp, done, private, priority, - zio_flags, arc_flags, zb)); - } - - ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); - ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); - rw_enter(&pbuf->b_data_lock, RW_READER); - - err = arc_read_nolock(pio, spa, bp, done, private, priority, - zio_flags, arc_flags, zb); - rw_exit(&pbuf->b_data_lock); - - return (err); -} - -int -arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, - arc_done_func_t *done, void *private, int priority, int zio_flags, - uint32_t *arc_flags, const zbookmark_t *zb) +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, + void *private, int priority, int zio_flags, uint32_t *arc_flags, + const zbookmark_t *zb) { arc_buf_hdr_t *hdr; arc_buf_t *buf = NULL; kmutex_t *hash_lock; zio_t *rzio; - uint64_t guid = spa_guid(spa); + uint64_t guid = spa_load_guid(spa); top: hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), @@ -2950,6 +3009,8 @@ top: arc_access(hdr, hash_lock); if (*arc_flags & ARC_L2CACHE) hdr->b_flags |= ARC_L2CACHE; + if (*arc_flags & ARC_L2COMPRESS) + hdr->b_flags |= ARC_L2COMPRESS; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), @@ -2990,13 +3051,15 @@ top: } if (*arc_flags & ARC_L2CACHE) hdr->b_flags |= ARC_L2CACHE; + if (*arc_flags & ARC_L2COMPRESS) + hdr->b_flags |= ARC_L2COMPRESS; if (BP_GET_LEVEL(bp) > 0) hdr->b_flags |= ARC_INDIRECT; } else { /* this block is in the ghost cache */ ASSERT(GHOST_STATE(hdr->b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); + ASSERT0(refcount_count(&hdr->b_refcnt)); ASSERT(hdr->b_buf == NULL); /* if this is a prefetch, we don't have a reference */ @@ -3006,6 +3069,8 @@ top: add_reference(hdr, hash_lock, private); if (*arc_flags & ARC_L2CACHE) hdr->b_flags |= ARC_L2CACHE; + if (*arc_flags & ARC_L2COMPRESS) + hdr->b_flags |= ARC_L2COMPRESS; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; @@ -3076,20 +3141,36 @@ top: cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; + cb->l2rcb_compress = hdr->b_l2hdr->b_compress; /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). + * Issue a null zio if the underlying buffer + * was squashed to zero size by compression. */ - rzio = zio_read_phys(pio, vd, addr, size, - buf->b_data, ZIO_CHECKSUM_OFF, - l2arc_read_done, cb, priority, zio_flags | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY, B_FALSE); + if (hdr->b_l2hdr->b_compress == + ZIO_COMPRESS_EMPTY) { + rzio = zio_null(pio, spa, vd, + l2arc_read_done, cb, + zio_flags | ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY); + } else { + rzio = zio_read_phys(pio, vd, addr, + hdr->b_l2hdr->b_asize, + buf->b_data, ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, + zio_flags | ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + } DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); - ARCSTAT_INCR(arcstat_l2_read_bytes, size); + ARCSTAT_INCR(arcstat_l2_read_bytes, + hdr->b_l2hdr->b_asize); if (*arc_flags & ARC_NOWAIT) { zio_nowait(rzio); @@ -3176,6 +3257,34 @@ arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) } /* + * Notify the arc that a block was freed, and thus will never be used again. + */ +void +arc_freed(spa_t *spa, const blkptr_t *bp) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + uint64_t guid = spa_load_guid(spa); + + hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), + &hash_lock); + if (hdr == NULL) + return; + if (HDR_BUF_AVAILABLE(hdr)) { + arc_buf_t *buf = hdr->b_buf; + add_reference(hdr, hash_lock, FTAG); + hdr->b_flags &= ~ARC_BUF_AVAILABLE; + mutex_exit(hash_lock); + + arc_release(buf, FTAG); + (void) arc_buf_remove_ref(buf, FTAG); + } else { + mutex_exit(hash_lock); + } + +} + +/* * This is used by the DMU to let the ARC know that a buffer is * being evicted, so the ARC should clean up. If this arc buf * is not yet in the evicted state, it will be put there. @@ -3332,6 +3441,16 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } + + /* + * We're releasing a duplicate user data buffer, update + * our statistics accordingly. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, + -hdr->b_size); + } hdr->b_datacnt -= 1; arc_cksum_verify(buf); @@ -3370,26 +3489,15 @@ arc_release(arc_buf_t *buf, void *tag) buf->b_private = NULL; if (l2hdr) { + ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); list_remove(l2hdr->b_dev->l2ad_buflist, hdr); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -buf_size); mutex_exit(&l2arc_buflist_mtx); } } -/* - * Release this buffer. If it does not match the provided BP, fill it - * with that block's contents. - */ -/* ARGSUSED */ -int -arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, - zbookmark_t *zb) -{ - arc_release(buf, tag); - return (0); -} - int arc_released(arc_buf_t *buf) { @@ -3526,9 +3634,9 @@ arc_write_done(zio_t *zio) zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, - arc_done_func_t *ready, arc_done_func_t *done, void *private, - int priority, int zio_flags, const zbookmark_t *zb) + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, + const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done, + void *private, int priority, int zio_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; @@ -3541,7 +3649,9 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(hdr->b_acb == NULL); if (l2arc) hdr->b_flags |= ARC_L2CACHE; - callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); + if (l2arc_compress) + hdr->b_flags |= ARC_L2COMPRESS; + callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE); callback->awcb_ready = ready; callback->awcb_done = done; callback->awcb_private = private; @@ -3557,48 +3667,19 @@ static int arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) { #ifdef _KERNEL - uint64_t available_memory = ptob(freemem); - static uint64_t page_load = 0; - static uint64_t last_txg = 0; + uint64_t available_memory; -#if defined(__i386) - available_memory = - MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); -#endif - if (available_memory >= zfs_write_limit_max) + if (zfs_arc_memory_throttle_disable) return (0); - if (txg > last_txg) { - last_txg = txg; - page_load = 0; - } - /* - * If we are in pageout, we know that memory is already tight, - * the arc is already going to be evicting, so we just want to - * continue to let page writes occur as quickly as possible. - */ - if (curproc == proc_pageout) { - if (page_load > MAX(ptob(minfree), available_memory) / 4) - return (ERESTART); - /* Note: reserve is inflated, so we deflate */ - page_load += reserve / 8; - return (0); - } else if (page_load > 0 && arc_reclaim_needed()) { - /* memory is low, delay before restarting */ + /* Easily reclaimable memory (free + inactive + arc-evictable) */ + available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory(); + + if (available_memory <= zfs_write_limit_max) { ARCSTAT_INCR(arcstat_memory_throttle_count, 1); DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); return (EAGAIN); } - page_load = 0; - - if (arc_size > arc_c_min) { - uint64_t evictable_memory = - arc_mru->arcs_lsize[ARC_BUFC_DATA] + - arc_mru->arcs_lsize[ARC_BUFC_METADATA] + - arc_mfu->arcs_lsize[ARC_BUFC_DATA] + - arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; - available_memory += MIN(evictable_memory, arc_size - arc_c_min); - } if (inflight_data > available_memory / 4) { ARCSTAT_INCR(arcstat_memory_throttle_count, 1); @@ -3725,7 +3806,7 @@ arc_init(void) cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); /* Convert seconds to clock ticks */ - arc_min_prefetch_lifespan = 1 * hz; + zfs_arc_min_prefetch_lifespan = 1 * hz; /* Start out with 1/8 of all memory */ arc_c = physmem * PAGESIZE / 8; @@ -3747,11 +3828,7 @@ arc_init(void) /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ arc_c_min = MAX(arc_c / 4, 64<<20); - /* set max to 1/2 of all memory, or all but 4GB, whichever is more */ - if (arc_c * 8 >= ((uint64_t)4<<30)) - arc_c_max = (arc_c * 8) - ((uint64_t)4<<30); - else - arc_c_max = arc_c_min; + /* set max to 1/2 of all memory */ arc_c_max = MAX(arc_c * 4, arc_c_max); /* @@ -3777,18 +3854,6 @@ arc_init(void) if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; - if (zfs_arc_grow_retry > 0) - arc_grow_retry = zfs_arc_grow_retry; - - if (zfs_arc_shrink_shift > 0) - arc_shrink_shift = zfs_arc_shrink_shift; - - if (zfs_arc_p_min_shift > 0) - arc_p_min_shift = zfs_arc_p_min_shift; - - if (zfs_arc_meta_prune > 0) - arc_meta_prune = zfs_arc_meta_prune; - /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -3850,7 +3915,7 @@ arc_init(void) kstat_install(arc_ksp); } - (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, + (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0, TS_RUN, minclsyspri); arc_dead = FALSE; @@ -3980,8 +4045,12 @@ arc_fini(void) * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are - * not already there. It scans until a headroom of buffers is satisfied, - * which itself is a buffer for ARC eviction. The thread that does this is + * not already there. It scans until a headroom of buffers is satisfied, + * which itself is a buffer for ARC eviction. If a compressible buffer is + * found during scanning and selected for writing to an L2ARC device, we + * temporarily boost scanning headroom during the next scan cycle to make + * sure we adapt to compression effects (which might significantly reduce + * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * @@ -4045,7 +4114,13 @@ arc_fini(void) * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers + * l2arc_nocompress skip compressing buffers * l2arc_headroom number of max device writes to precache + * l2arc_headroom_boost when we find compressed buffers during ARC + * scanning, we multiply headroom by this + * percentage factor for the next scan cycle, + * since more compressed buffers are likely to + * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are @@ -4079,14 +4154,24 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) } static uint64_t -l2arc_write_size(l2arc_dev_t *dev) +l2arc_write_size(void) { uint64_t size; - size = dev->l2ad_write; + /* + * Make sure our globals have meaningful values in case the user + * altered them. + */ + size = l2arc_write_max; + if (size == 0) { + cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " + "be greater than zero, resetting it to the default (%d)", + L2ARC_WRITE_SIZE); + size = l2arc_write_max = L2ARC_WRITE_SIZE; + } if (arc_warm == B_FALSE) - size += dev->l2ad_boost; + size += l2arc_write_boost; return (size); @@ -4117,14 +4202,14 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) static void l2arc_hdr_stat_add(void) { - ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); + ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE); ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); } static void l2arc_hdr_stat_remove(void) { - ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); + ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE); ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); } @@ -4260,14 +4345,23 @@ l2arc_write_done(zio_t *zio) continue; } + abl2 = ab->b_l2hdr; + + /* + * Release the temporary compressed buffer as soon as possible. + */ + if (abl2->b_compress != ZIO_COMPRESS_OFF) + l2arc_release_cdata_buf(ab); + if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, ab); - abl2 = ab->b_l2hdr; + ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); ab->b_l2hdr = NULL; kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } @@ -4318,6 +4412,13 @@ l2arc_read_done(zio_t *zio) ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* + * If the buffer was compressed, decompress it first. + */ + if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) + l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); + ASSERT(zio->io_data != NULL); + + /* * Check this survived the L2ARC journey. */ equal = arc_cksum_equal(buf); @@ -4512,8 +4613,10 @@ top: */ if (ab->b_l2hdr != NULL) { abl2 = ab->b_l2hdr; + ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); ab->b_l2hdr = NULL; kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } list_remove(buflist, ab); @@ -4537,37 +4640,54 @@ top: * * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. + * The headroom_boost is an in-out parameter used to maintain headroom boost + * state between calls to this function. + * + * Returns the number of bytes actually written (which may be smaller than + * the delta by which the device hand has changed due to alignment). */ static uint64_t -l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, + boolean_t *headroom_boost) { arc_buf_hdr_t *ab, *ab_prev, *head; - l2arc_buf_hdr_t *hdrl2; list_t *list; - uint64_t passed_sz, write_sz, buf_sz, headroom; + uint64_t write_asize, write_psize, write_sz, headroom, + buf_compress_minsz; void *buf_data; - kmutex_t *hash_lock, *list_lock = NULL; - boolean_t have_lock, full; + kmutex_t *list_lock = NULL; + boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; - uint64_t guid = spa_guid(spa); + uint64_t guid = spa_load_guid(spa); int try; + const boolean_t do_headroom_boost = *headroom_boost; ASSERT(dev->l2ad_vdev != NULL); + /* Lower the flag now, we might want to raise it again later. */ + *headroom_boost = B_FALSE; + pio = NULL; - write_sz = 0; + write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); head->b_flags |= ARC_L2_WRITE_HEAD; /* + * We will want to try to compress buffers that are at least 2x the + * device sector size. + */ + buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; + + /* * Copy buffers for L2ARC writing. */ mutex_enter(&l2arc_buflist_mtx); for (try = 0; try <= 3; try++) { + uint64_t passed_sz = 0; + list = l2arc_list_locked(try, &list_lock); - passed_sz = 0; /* * L2ARC fast warmup. @@ -4575,21 +4695,27 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ - headroom = target_sz * l2arc_headroom; if (arc_warm == B_FALSE) ab = list_head(list); else ab = list_tail(list); + headroom = target_sz * l2arc_headroom; + if (do_headroom_boost) + headroom = (headroom * l2arc_headroom_boost) / 100; + for (; ab; ab = ab_prev) { + l2arc_buf_hdr_t *l2hdr; + kmutex_t *hash_lock; + uint64_t buf_sz; + if (arc_warm == B_FALSE) ab_prev = list_next(list, ab); else ab_prev = list_prev(list, ab); hash_lock = HDR_LOCK(ab); - have_lock = MUTEX_HELD(hash_lock); - if (!have_lock && !mutex_tryenter(hash_lock)) { + if (!mutex_tryenter(hash_lock)) { /* * Skip this buffer rather than waiting. */ @@ -4635,16 +4761,29 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) /* * Create and add a new L2ARC header. */ - hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), - KM_PUSHPAGE); - hdrl2->b_dev = dev; - hdrl2->b_daddr = dev->l2ad_hand; + l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), + KM_PUSHPAGE); + l2hdr->b_dev = dev; + arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS); ab->b_flags |= ARC_L2_WRITING; - ab->b_l2hdr = hdrl2; - list_insert_head(dev->l2ad_buflist, ab); - buf_data = ab->b_buf->b_data; + + /* + * Temporarily stash the data buffer in b_tmp_cdata. + * The subsequent write step will pick it up from + * there. This is because can't access ab->b_buf + * without holding the hash_lock, which we in turn + * can't access without holding the ARC list locks + * (which we want to avoid during compression/writing) + */ + l2hdr->b_compress = ZIO_COMPRESS_OFF; + l2hdr->b_asize = ab->b_size; + l2hdr->b_tmp_cdata = ab->b_buf->b_data; + buf_sz = ab->b_size; + ab->b_l2hdr = l2hdr; + + list_insert_head(dev->l2ad_buflist, ab); /* * Compute and store the buffer cksum before @@ -4655,6 +4794,64 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) mutex_exit(hash_lock); + write_sz += buf_sz; + } + + mutex_exit(list_lock); + + if (full == B_TRUE) + break; + } + + /* No buffers selected for writing? */ + if (pio == NULL) { + ASSERT0(write_sz); + mutex_exit(&l2arc_buflist_mtx); + kmem_cache_free(hdr_cache, head); + return (0); + } + + /* + * Now start writing the buffers. We're starting at the write head + * and work backwards, retracing the course of the buffer selector + * loop above. + */ + for (ab = list_prev(dev->l2ad_buflist, head); ab; + ab = list_prev(dev->l2ad_buflist, ab)) { + l2arc_buf_hdr_t *l2hdr; + uint64_t buf_sz; + + /* + * We shouldn't need to lock the buffer here, since we flagged + * it as ARC_L2_WRITING in the previous step, but we must take + * care to only access its L2 cache parameters. In particular, + * ab->b_buf may be invalid by now due to ARC eviction. + */ + l2hdr = ab->b_l2hdr; + l2hdr->b_daddr = dev->l2ad_hand; + + if (!l2arc_nocompress && (ab->b_flags & ARC_L2COMPRESS) && + l2hdr->b_asize >= buf_compress_minsz) { + if (l2arc_compress_buf(l2hdr)) { + /* + * If compression succeeded, enable headroom + * boost on the next scan cycle. + */ + *headroom_boost = B_TRUE; + } + } + + /* + * Pick up the buffer data we had previously stashed away + * (and now potentially also compressed). + */ + buf_data = l2hdr->b_tmp_cdata; + buf_sz = l2hdr->b_asize; + + /* Compression may have squashed the buffer to zero length. */ + if (buf_sz != 0) { + uint64_t buf_p_sz; + wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, @@ -4664,33 +4861,24 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) zio_t *, wzio); (void) zio_nowait(wzio); + write_asize += buf_sz; /* * Keep the clock hand suitably device-aligned. */ - buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); - - write_sz += buf_sz; - dev->l2ad_hand += buf_sz; + buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); + write_psize += buf_p_sz; + dev->l2ad_hand += buf_p_sz; } - - mutex_exit(list_lock); - - if (full == B_TRUE) - break; } - mutex_exit(&l2arc_buflist_mtx); - if (pio == NULL) { - ASSERT3U(write_sz, ==, 0); - kmem_cache_free(hdr_cache, head); - return (0); - } + mutex_exit(&l2arc_buflist_mtx); - ASSERT3U(write_sz, <=, target_sz); + ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); - ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); + ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); - vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); + ARCSTAT_INCR(arcstat_l2_asize, write_asize); + vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. @@ -4708,7 +4896,153 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; - return (write_sz); + return (write_asize); +} + +/* + * Compresses an L2ARC buffer. + * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its + * size in l2hdr->b_asize. This routine tries to compress the data and + * depending on the compression result there are three possible outcomes: + * *) The buffer was incompressible. The original l2hdr contents were left + * untouched and are ready for writing to an L2 device. + * *) The buffer was all-zeros, so there is no need to write it to an L2 + * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is + * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. + * *) Compression succeeded and b_tmp_cdata was replaced with a temporary + * data buffer which holds the compressed data to be written, and b_asize + * tells us how much data there is. b_compress is set to the appropriate + * compression algorithm. Once writing is done, invoke + * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. + * + * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the + * buffer was incompressible). + */ +static boolean_t +l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) +{ + void *cdata; + size_t csize, len; + + ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); + ASSERT(l2hdr->b_tmp_cdata != NULL); + + len = l2hdr->b_asize; + cdata = zio_data_buf_alloc(len); + csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, + cdata, l2hdr->b_asize); + + if (csize == 0) { + /* zero block, indicate that there's nothing to write */ + zio_data_buf_free(cdata, len); + l2hdr->b_compress = ZIO_COMPRESS_EMPTY; + l2hdr->b_asize = 0; + l2hdr->b_tmp_cdata = NULL; + ARCSTAT_BUMP(arcstat_l2_compress_zeros); + return (B_TRUE); + } else if (csize > 0 && csize < len) { + /* + * Compression succeeded, we'll keep the cdata around for + * writing and release it afterwards. + */ + l2hdr->b_compress = ZIO_COMPRESS_LZ4; + l2hdr->b_asize = csize; + l2hdr->b_tmp_cdata = cdata; + ARCSTAT_BUMP(arcstat_l2_compress_successes); + return (B_TRUE); + } else { + /* + * Compression failed, release the compressed buffer. + * l2hdr will be left unmodified. + */ + zio_data_buf_free(cdata, len); + ARCSTAT_BUMP(arcstat_l2_compress_failures); + return (B_FALSE); + } +} + +/* + * Decompresses a zio read back from an l2arc device. On success, the + * underlying zio's io_data buffer is overwritten by the uncompressed + * version. On decompression error (corrupt compressed stream), the + * zio->io_error value is set to signal an I/O error. + * + * Please note that the compressed data stream is not checksummed, so + * if the underlying device is experiencing data corruption, we may feed + * corrupt data to the decompressor, so the decompressor needs to be + * able to handle this situation (LZ4 does). + */ +static void +l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) +{ + uint64_t csize; + void *cdata; + + ASSERT(L2ARC_IS_VALID_COMPRESS(c)); + + if (zio->io_error != 0) { + /* + * An io error has occured, just restore the original io + * size in preparation for a main pool read. + */ + zio->io_orig_size = zio->io_size = hdr->b_size; + return; + } + + if (c == ZIO_COMPRESS_EMPTY) { + /* + * An empty buffer results in a null zio, which means we + * need to fill its io_data after we're done restoring the + * buffer's contents. + */ + ASSERT(hdr->b_buf != NULL); + bzero(hdr->b_buf->b_data, hdr->b_size); + zio->io_data = zio->io_orig_data = hdr->b_buf->b_data; + } else { + ASSERT(zio->io_data != NULL); + /* + * We copy the compressed data from the start of the arc buffer + * (the zio_read will have pulled in only what we need, the + * rest is garbage which we will overwrite at decompression) + * and then decompress back to the ARC data buffer. This way we + * can minimize copying by simply decompressing back over the + * original compressed data (rather than decompressing to an + * aux buffer and then copying back the uncompressed buffer, + * which is likely to be much larger). + */ + csize = zio->io_size; + cdata = zio_data_buf_alloc(csize); + bcopy(zio->io_data, cdata, csize); + if (zio_decompress_data(c, cdata, zio->io_data, csize, + hdr->b_size) != 0) + zio->io_error = EIO; + zio_data_buf_free(cdata, csize); + } + + /* Restore the expected uncompressed IO size. */ + zio->io_orig_size = zio->io_size = hdr->b_size; +} + +/* + * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. + * This buffer serves as a temporary holder of compressed data while + * the buffer entry is being written to an l2arc device. Once that is + * done, we can dispose of it. + */ +static void +l2arc_release_cdata_buf(arc_buf_hdr_t *ab) +{ + l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr; + + if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) { + /* + * If the data was compressed, then we've allocated a + * temporary buffer for it, so now we need to release it. + */ + ASSERT(l2hdr->b_tmp_cdata != NULL); + zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size); + } + l2hdr->b_tmp_cdata = NULL; } /* @@ -4723,6 +5057,7 @@ l2arc_feed_thread(void) spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); + boolean_t headroom_boost = B_FALSE; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); @@ -4775,7 +5110,7 @@ l2arc_feed_thread(void) /* * Avoid contributing to memory pressure. */ - if (arc_reclaim_needed()) { + if (arc_no_grow) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; @@ -4783,7 +5118,7 @@ l2arc_feed_thread(void) ARCSTAT_BUMP(arcstat_l2_feeds); - size = l2arc_write_size(dev); + size = l2arc_write_size(); /* * Evict L2ARC buffers that will be overwritten. @@ -4793,7 +5128,7 @@ l2arc_feed_thread(void) /* * Write ARC buffers. */ - wrote = l2arc_write_buffers(spa, dev, size); + wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); /* * Calculate interval between writes. @@ -4841,8 +5176,6 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; - adddev->l2ad_write = l2arc_write_max; - adddev->l2ad_boost = l2arc_write_boost; adddev->l2ad_start = VDEV_LABEL_START_SIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; @@ -4850,7 +5183,6 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; list_link_init(&adddev->l2ad_node); - ASSERT3U(adddev->l2ad_write, >, 0); /* * This is a list of all ARC buffers that are still valid on the @@ -4983,49 +5315,64 @@ EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); -module_param(zfs_arc_min, ulong, 0444); +module_param(zfs_arc_min, ulong, 0644); MODULE_PARM_DESC(zfs_arc_min, "Min arc size"); -module_param(zfs_arc_max, ulong, 0444); +module_param(zfs_arc_max, ulong, 0644); MODULE_PARM_DESC(zfs_arc_max, "Max arc size"); -module_param(zfs_arc_meta_limit, ulong, 0444); +module_param(zfs_arc_meta_limit, ulong, 0644); MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size"); -module_param(zfs_arc_meta_prune, int, 0444); +module_param(zfs_arc_meta_prune, int, 0644); MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune"); -module_param(zfs_arc_grow_retry, int, 0444); +module_param(zfs_arc_grow_retry, int, 0644); MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); -module_param(zfs_arc_shrink_shift, int, 0444); +module_param(zfs_arc_shrink_shift, int, 0644); MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)"); -module_param(zfs_arc_p_min_shift, int, 0444); +module_param(zfs_arc_p_min_shift, int, 0644); MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p"); -module_param(l2arc_write_max, ulong, 0444); +module_param(zfs_disable_dup_eviction, int, 0644); +MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction"); + +module_param(zfs_arc_memory_throttle_disable, int, 0644); +MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle"); + +module_param(zfs_arc_min_prefetch_lifespan, int, 0644); +MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block"); + +module_param(l2arc_write_max, ulong, 0644); MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval"); -module_param(l2arc_write_boost, ulong, 0444); +module_param(l2arc_write_boost, ulong, 0644); MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup"); -module_param(l2arc_headroom, ulong, 0444); +module_param(l2arc_headroom, ulong, 0644); MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache"); -module_param(l2arc_feed_secs, ulong, 0444); +module_param(l2arc_headroom_boost, ulong, 0644); +MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier"); + +module_param(l2arc_feed_secs, ulong, 0644); MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing"); -module_param(l2arc_feed_min_ms, ulong, 0444); +module_param(l2arc_feed_min_ms, ulong, 0644); MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds"); -module_param(l2arc_noprefetch, int, 0444); +module_param(l2arc_noprefetch, int, 0644); MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers"); -module_param(l2arc_feed_again, int, 0444); +module_param(l2arc_nocompress, int, 0644); +MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers"); + +module_param(l2arc_feed_again, int, 0644); MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup"); -module_param(l2arc_norw, int, 0444); +module_param(l2arc_norw, int, 0644); MODULE_PARM_DESC(l2arc_norw, "No reads during writes"); #endif