X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Farc.c;h=ff631e61b2eaa2eaf04d2ad1712f6f17db554060;hb=5547c2f1bf49802835fd6c52f15115ba344a2a8b;hp=8adb54dc6e195748d56578f3aa1e8eb213a42f86;hpb=428870ff734fdaccc342b33fc53cf94724409a46;p=zfs.git diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 8adb54d..ff631e6 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -174,12 +174,13 @@ static boolean_t arc_warm; /* * These tunables are for performance analysis. */ -uint64_t zfs_arc_max; -uint64_t zfs_arc_min; -uint64_t zfs_arc_meta_limit = 0; +unsigned long zfs_arc_max = 0; +unsigned long zfs_arc_min = 0; +unsigned long zfs_arc_meta_limit = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; +int zfs_arc_reduce_dnlc_percent = 0; /* * Note that buffers can be in one of 6 states: @@ -282,6 +283,14 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_memory_direct_count; + kstat_named_t arcstat_memory_indirect_count; + kstat_named_t arcstat_no_grow; + kstat_named_t arcstat_tempreserve; + kstat_named_t arcstat_loaned_bytes; + kstat_named_t arcstat_meta_used; + kstat_named_t arcstat_meta_limit; + kstat_named_t arcstat_meta_max; } arc_stats_t; static arc_stats_t arc_stats = { @@ -337,7 +346,15 @@ static arc_stats_t arc_stats = { { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 } + { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "memory_direct_count", KSTAT_DATA_UINT64 }, + { "memory_indirect_count", KSTAT_DATA_UINT64 }, + { "arc_no_grow", KSTAT_DATA_UINT64 }, + { "arc_tempreserve", KSTAT_DATA_UINT64 }, + { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, + { "arc_meta_used", KSTAT_DATA_UINT64 }, + { "arc_meta_limit", KSTAT_DATA_UINT64 }, + { "arc_meta_max", KSTAT_DATA_UINT64 }, }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -399,13 +416,12 @@ static arc_state_t *arc_l2c_only; #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ - -static int arc_no_grow; /* Don't try to grow cache size */ -static uint64_t arc_tempreserve; -static uint64_t arc_loaned_bytes; -static uint64_t arc_meta_used; -static uint64_t arc_meta_limit; -static uint64_t arc_meta_max = 0; +#define arc_no_grow ARCSTAT(arcstat_no_grow) +#define arc_tempreserve ARCSTAT(arcstat_tempreserve) +#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) +#define arc_meta_used ARCSTAT(arcstat_meta_used) +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) +#define arc_meta_max ARCSTAT(arcstat_meta_max) typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; @@ -523,12 +539,13 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); * Hash table routines */ -#define HT_LOCK_PAD 64 +#define HT_LOCK_ALIGN 64 +#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN))) struct ht_lock { kmutex_t ht_lock; #ifdef _KERNEL - unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; + unsigned char pad[HT_LOCK_PAD]; #endif }; @@ -565,14 +582,14 @@ uint64_t zfs_crc64_table[256]; /* * L2ARC Performance Tunables */ -uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ -uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ -uint64_t l2arc_headroom = 
L2ARC_HEADROOM; /* number of dev writes */ -uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ -boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ -boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ +unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ +int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +int l2arc_feed_again = B_TRUE; /* turbo warmup */ +int l2arc_norw = B_TRUE; /* no reads during writes */ /* * L2ARC Internals @@ -772,8 +789,15 @@ buf_fini(void) { int i; +#if defined(_KERNEL) && defined(HAVE_SPL) + /* Large allocations which do not require contiguous pages + * should be using vmem_free() in the linux kernel */ + vmem_free(buf_hash_table.ht_table, + (buf_hash_table.ht_mask + 1) * sizeof (void *)); +#else kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); +#endif for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_cache); @@ -794,6 +818,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag) refcount_create(&buf->b_refcnt); cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + list_link_init(&buf->b_arc_node); + list_link_init(&buf->b_l2node); arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); return (0); @@ -873,8 +899,15 @@ buf_init(void) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; +#if defined(_KERNEL) && defined(HAVE_SPL) + /* Large allocations which do not require contiguous pages + * should be using vmem_alloc() in the linux kernel */ + buf_hash_table.ht_table = + vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); +#else buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); +#endif if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; @@ -952,11 +985,6 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) void arc_buf_thaw(arc_buf_t *buf) { - kmutex_t *hash_lock; - - hash_lock = HDR_LOCK(buf->b_hdr); - mutex_enter(hash_lock); - if (zfs_flags & ZFS_DEBUG_MODIFY) { if (buf->b_hdr->b_state != arc_anon) panic("modifying non-anon buffer!"); @@ -978,7 +1006,6 @@ arc_buf_thaw(arc_buf_t *buf) } mutex_exit(&buf->b_hdr->b_freeze_lock); - mutex_exit(hash_lock); } void @@ -1149,6 +1176,8 @@ arc_space_consume(uint64_t space, arc_space_type_t type) ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { + default: + break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; @@ -1173,6 +1202,8 @@ arc_space_return(uint64_t space, arc_space_type_t type) ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { + default: + break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; @@ -1432,10 +1463,11 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { + l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; + ASSERT(refcount_is_zero(&hdr->b_refcnt)); ASSERT3P(hdr->b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; if (l2hdr != NULL) { 
boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); @@ -1709,7 +1741,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, mutex_exit(&state->arcs_mtx); if (bytes_evicted < bytes) - dprintf("only evicted %lld bytes from %x", + dprintf("only evicted %lld bytes from %x\n", (longlong_t)bytes_evicted, state); if (skipped) @@ -1730,12 +1762,12 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mru_ghost->arcs_lsize[type], mru_over); - arc_evict_ghost(arc_mru_ghost, NULL, todelete); + arc_evict_ghost(arc_mru_ghost, 0, todelete); } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c); - arc_evict_ghost(arc_mfu_ghost, NULL, todelete); + arc_evict_ghost(arc_mfu_ghost, 0, todelete); } } @@ -1750,18 +1782,25 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t marker; list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; ASSERT(GHOST_STATE(state)); + bzero(&marker, sizeof(marker)); top: mutex_enter(&state->arcs_mtx); for (ab = list_tail(list); ab; ab = ab_prev) { ab_prev = list_prev(list, ab); if (spa && ab->b_spa != spa) continue; + + /* ignore markers */ + if (ab->b_spa == 0) + continue; + hash_lock = HDR_LOCK(ab); /* caller may be trying to modify this buffer, skip it */ if (MUTEX_HELD(hash_lock)) @@ -1788,15 +1827,21 @@ top: DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); if (bytes >= 0 && bytes_deleted >= bytes) break; - } else { - if (bytes < 0) { - mutex_exit(&state->arcs_mtx); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - goto top; - } + } else if (bytes < 0) { + /* + * Insert a list marker and then wait for the + * hash lock to become available. Once its + * available, restart from where we left off. 
+ */ + list_insert_after(list, ab, &marker); + mutex_exit(&state->arcs_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + mutex_enter(&state->arcs_mtx); + ab_prev = list_prev(list, &marker); + list_remove(list, &marker); + } else bufs_skipped += 1; - } } mutex_exit(&state->arcs_mtx); @@ -1812,7 +1857,7 @@ top: } if (bytes_deleted < bytes) - dprintf("only deleted %lld bytes from %p", + dprintf("only deleted %lld bytes from %p\n", (longlong_t)bytes_deleted, state); } @@ -1825,18 +1870,19 @@ arc_adjust(void) * Adjust MRU size */ - adjustment = MIN(arc_size - arc_c, - arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p); + adjustment = MIN((int64_t)(arc_size - arc_c), + (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - + arc_p)); if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); - (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); adjustment -= delta; } if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); - (void) arc_evict(arc_mru, NULL, delta, FALSE, + (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); } @@ -1848,14 +1894,14 @@ arc_adjust(void) if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); - (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); adjustment -= delta; } if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { int64_t delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); - (void) arc_evict(arc_mfu, NULL, delta, FALSE, + (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA); } @@ -1867,7 +1913,7 @@ arc_adjust(void) if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { delta = MIN(arc_mru_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mru_ghost, NULL, delta); + arc_evict_ghost(arc_mru_ghost, 0, delta); } adjustment = @@ -1875,7 +1921,7 @@ arc_adjust(void) if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { delta = MIN(arc_mfu_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mfu_ghost, NULL, delta); + arc_evict_ghost(arc_mfu_ghost, 0, delta); } } @@ -1976,9 +2022,8 @@ arc_shrink(void) static int arc_reclaim_needed(void) { - uint64_t extra; - #ifdef _KERNEL + uint64_t extra; if (needfree) return (1); @@ -2040,14 +2085,16 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) kmem_cache_t *prev_data_cache = NULL; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; - #ifdef _KERNEL - if (arc_meta_used >= arc_meta_limit) { + int retry = 0; + + while ((arc_meta_used >= arc_meta_limit) && (retry < 10)) { /* * We are exceeding our meta-data cache limit. * Purge some DNLC entries to release holds on meta-data. 
*/ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); + retry++; } #if defined(__i386) /* @@ -2113,16 +2160,18 @@ arc_reclaim_thread(void) arc_no_grow = FALSE; } - if (2 * arc_c < arc_size + - arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size) - arc_adjust(); + /* Keep meta data usage within limits */ + if (arc_meta_used >= arc_meta_limit) + arc_kmem_reap_now(ARC_RECLAIM_CONS); + + arc_adjust(); if (arc_eviction_list != NULL) arc_do_user_evicts(); /* block until needed, or one second, whichever is shorter */ CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&arc_reclaim_thr_cv, + (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); } @@ -2133,6 +2182,59 @@ arc_reclaim_thread(void) thread_exit(); } +#ifdef _KERNEL +/* + * Under Linux the arc shrinker may be called for synchronous (direct) + * reclaim, or asynchronous (indirect) reclaim. When called by kswapd + * for indirect reclaim we take a conservative approach and just reap + * free slabs from the ARC caches. If this proves to be insufficient + * direct reclaim will be trigger. In direct reclaim a more aggressive + * strategy is used, data is evicted from the ARC and free slabs reaped. + */ +static int +__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) +{ + arc_reclaim_strategy_t strategy; + int arc_reclaim; + + /* Return number of reclaimable pages based on arc_shrink_shift */ + arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min)) + >> arc_shrink_shift, 0); + if (sc->nr_to_scan == 0) + return (arc_reclaim); + + /* Prevent reclaim below arc_c_min */ + if (arc_reclaim <= 0) + return (-1); + + /* Not allowed to perform filesystem reclaim */ + if (!(sc->gfp_mask & __GFP_FS)) + return (-1); + + /* Reclaim in progress */ + if (mutex_tryenter(&arc_reclaim_thr_lock) == 0) + return (-1); + + if (current_is_kswapd()) { + strategy = ARC_RECLAIM_CONS; + ARCSTAT_INCR(arcstat_memory_indirect_count, 1); + } else { + strategy = ARC_RECLAIM_AGGR; + ARCSTAT_INCR(arcstat_memory_direct_count, 1); + } + + arc_kmem_reap_now(strategy); + arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min)) + >> arc_shrink_shift, 0); + mutex_exit(&arc_reclaim_thr_lock); + + return (arc_reclaim); +} +SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func); + +SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS); +#endif /* _KERNEL */ + /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are comming from. This function is only called @@ -2159,6 +2261,7 @@ arc_adapt(int bytes, arc_state_t *state) if (state == arc_mru_ghost) { mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); + mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { @@ -2166,6 +2269,7 @@ arc_adapt(int bytes, arc_state_t *state) mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); + mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); arc_p = MAX(arc_p_min, arc_p - delta); @@ -2295,7 +2399,7 @@ arc_get_data_buf(arc_buf_t *buf) state = (arc_mru->arcs_lsize[type] >= size && mfu_space > arc_mfu->arcs_size) ? 
arc_mru : arc_mfu; } - if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { + if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); @@ -2666,7 +2770,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, uint32_t *arc_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr; - arc_buf_t *buf; + arc_buf_t *buf = NULL; kmutex_t *hash_lock; zio_t *rzio; uint64_t guid = spa_guid(spa); @@ -2691,7 +2795,7 @@ top: arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), - KM_SLEEP); + KM_PUSHPAGE); acb->acb_done = done; acb->acb_private = private; if (pio != NULL) @@ -2748,7 +2852,7 @@ top: uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; - uint64_t addr; + uint64_t addr = -1; boolean_t devw = B_FALSE; if (hdr == NULL) { @@ -2807,7 +2911,7 @@ top: ASSERT(!GHOST_STATE(hdr->b_state)); - acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); + acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE); acb->acb_done = done; acb->acb_private = private; @@ -2856,7 +2960,7 @@ top: ARCSTAT_BUMP(arcstat_l2_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), - KM_SLEEP); + KM_PUSHPAGE); cb->l2rcb_buf = buf; cb->l2rcb_spa = spa; cb->l2rcb_bp = *bp; @@ -3026,7 +3130,7 @@ arc_release(arc_buf_t *buf, void *tag) arc_buf_hdr_t *hdr; kmutex_t *hash_lock = NULL; l2arc_buf_hdr_t *l2hdr; - uint64_t buf_size; + uint64_t buf_size = 0; /* * It would be nice to assert that if it's DMU metadata (level > @@ -3401,7 +3505,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * in order to compress/encrypt/etc the data. We therefor need to * make sure that there is sufficient available memory for this. */ - if (error = arc_memory_throttle(reserve, anon_size, txg)) + if ((error = arc_memory_throttle(reserve, anon_size, txg))) return (error); /* @@ -3445,6 +3549,12 @@ arc_init(void) * need to limit the cache to 1/8 of VM size. */ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); + /* + * Register a shrinker to support synchronous (direct) memory + * reclaim from the arc. This is done to prevent kswapd from + * swapping out pages when it is preferable to shrink the arc. + */ + spl_register_shrinker(&arc_shrinker); #endif /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ @@ -3470,6 +3580,7 @@ arc_init(void) /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; + arc_meta_max = 0; /* Allow the tunable to override if it is reasonable */ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) @@ -3487,6 +3598,9 @@ arc_init(void) if (zfs_arc_p_min_shift > 0) arc_p_min_shift = zfs_arc_p_min_shift; + if (zfs_arc_reduce_dnlc_percent > 0) + arc_reduce_dnlc_percent = zfs_arc_reduce_dnlc_percent; + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -3561,6 +3675,10 @@ void arc_fini(void) { mutex_enter(&arc_reclaim_thr_lock); +#ifdef _KERNEL + spl_unregister_shrinker(&arc_shrinker); +#endif /* _KERNEL */ + arc_thread_exit = 1; while (arc_thread_exit != 0) cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); @@ -3870,7 +3988,7 @@ out: * Free buffers that were tagged for destruction. 
*/ static void -l2arc_do_free_on_write() +l2arc_do_free_on_write(void) { list_t *buflist; l2arc_data_free_t *df, *df_prev; @@ -4050,7 +4168,7 @@ l2arc_read_done(zio_t *zio) static list_t * l2arc_list_locked(int list_num, kmutex_t **lock) { - list_t *list; + list_t *list = NULL; ASSERT(list_num >= 0 && list_num <= 3); @@ -4223,11 +4341,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) list_t *list; uint64_t passed_sz, write_sz, buf_sz, headroom; void *buf_data; - kmutex_t *hash_lock, *list_lock; + kmutex_t *hash_lock, *list_lock = NULL; boolean_t have_lock, full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_guid(spa); + int try; ASSERT(dev->l2ad_vdev != NULL); @@ -4241,7 +4360,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) * Copy buffers for L2ARC writing. */ mutex_enter(&l2arc_buflist_mtx); - for (int try = 0; try <= 3; try++) { + for (try = 0; try <= 3; try++) { list = l2arc_list_locked(try, &list_lock); passed_sz = 0; @@ -4405,8 +4524,8 @@ l2arc_feed_thread(void) while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - next); + (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv, + &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; @@ -4438,6 +4557,16 @@ l2arc_feed_thread(void) ASSERT(spa != NULL); /* + * If the pool is read-only then force the feed thread to + * sleep a little longer. + */ + if (!spa_writeable(spa)) { + next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + /* * Avoid contributing to memory pressure. */ if (arc_reclaim_needed()) { @@ -4514,6 +4643,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; + list_link_init(&adddev->l2ad_node); ASSERT3U(adddev->l2ad_write, >, 0); /* @@ -4639,3 +4769,55 @@ l2arc_stop(void) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } + +#if defined(_KERNEL) && defined(HAVE_SPL) +EXPORT_SYMBOL(arc_read); +EXPORT_SYMBOL(arc_buf_remove_ref); +EXPORT_SYMBOL(arc_getbuf_func); + +module_param(zfs_arc_min, ulong, 0444); +MODULE_PARM_DESC(zfs_arc_min, "Min arc size"); + +module_param(zfs_arc_max, ulong, 0444); +MODULE_PARM_DESC(zfs_arc_max, "Max arc size"); + +module_param(zfs_arc_meta_limit, ulong, 0444); +MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size"); + +module_param(zfs_arc_reduce_dnlc_percent, int, 0444); +MODULE_PARM_DESC(zfs_arc_reduce_dnlc_percent, "Meta reclaim percentage"); + +module_param(zfs_arc_grow_retry, int, 0444); +MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); + +module_param(zfs_arc_shrink_shift, int, 0444); +MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)"); + +module_param(zfs_arc_p_min_shift, int, 0444); +MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p"); + +module_param(l2arc_write_max, ulong, 0444); +MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval"); + +module_param(l2arc_write_boost, ulong, 0444); +MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup"); + +module_param(l2arc_headroom, ulong, 0444); +MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache"); + +module_param(l2arc_feed_secs, ulong, 0444); +MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing"); + 
+module_param(l2arc_feed_min_ms, ulong, 0444); +MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds"); + +module_param(l2arc_noprefetch, int, 0444); +MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers"); + +module_param(l2arc_feed_again, int, 0444); +MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup"); + +module_param(l2arc_norw, int, 0444); +MODULE_PARM_DESC(l2arc_norw, "No reads during writes"); + +#endif
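
The centerpiece of this patch is the registration of a Linux shrinker so that direct memory
reclaim can shrink the ARC instead of pushing kswapd toward swapping. As a rough sketch of
that kernel interface only (not the ZFS code itself, which goes through the
SPL_SHRINKER_CALLBACK_WRAPPER/SPL_SHRINKER_DECLARE wrappers above so it builds against the
several shrinker ABIs the SPL supports), a minimal standalone shrinker for a hypothetical
page cache might look like the following. It assumes a kernel of roughly the same vintage as
this patch (the pre-3.12 single .shrink callback taking a struct shrink_control); the module
name and the example_cached_pages counter are invented for illustration.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/shrinker.h>
#include <linux/atomic.h>

/* Hypothetical count of reclaimable pages held by this module's cache */
static atomic_long_t example_cached_pages = ATOMIC_LONG_INIT(0);

static int
example_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
	long cached = atomic_long_read(&example_cached_pages);

	/* nr_to_scan == 0 is only a query: report how many pages could be freed */
	if (sc->nr_to_scan == 0)
		return (cached);

	/* Refuse to recurse into filesystem code from fs-context allocations */
	if (!(sc->gfp_mask & __GFP_FS))
		return (-1);

	/* Release up to nr_to_scan pages from the hypothetical cache */
	atomic_long_sub(min_t(long, cached, sc->nr_to_scan),
	    &example_cached_pages);

	return (atomic_long_read(&example_cached_pages));
}

static struct shrinker example_shrinker = {
	.shrink = example_shrink,
	.seeks = DEFAULT_SEEKS,
};

static int __init
example_init(void)
{
	register_shrinker(&example_shrinker);
	return (0);
}

static void __exit
example_exit(void)
{
	unregister_shrinker(&example_shrinker);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

The control flow mirrors __arc_shrinker_func() in the hunk above: a zero nr_to_scan call is
treated as a query, allocations lacking __GFP_FS are refused to avoid recursing into the
filesystem, and the callback returns the number of reclaimable pages remaining. The ARC
version additionally refuses to dip below arc_c_min and picks a conservative strategy for
kswapd (indirect) reclaim versus an aggressive one for direct reclaim.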