/*
* These tunables are for performance analysis.
*/
-uint64_t zfs_arc_max;
-uint64_t zfs_arc_min;
-uint64_t zfs_arc_meta_limit = 0;
+unsigned long zfs_arc_max = 0;
+unsigned long zfs_arc_min = 0;
+unsigned long zfs_arc_meta_limit = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
+int zfs_arc_reduce_dnlc_percent = 0;
/*
* Note that buffers can be in one of 6 states:
kstat_named_t arcstat_l2_size;
kstat_named_t arcstat_l2_hdr_size;
kstat_named_t arcstat_memory_throttle_count;
+ kstat_named_t arcstat_memory_direct_count;
+ kstat_named_t arcstat_memory_indirect_count;
+ kstat_named_t arcstat_no_grow;
+ kstat_named_t arcstat_tempreserve;
+ kstat_named_t arcstat_loaned_bytes;
+ kstat_named_t arcstat_meta_used;
+ kstat_named_t arcstat_meta_limit;
+ kstat_named_t arcstat_meta_max;
} arc_stats_t;
static arc_stats_t arc_stats = {
{ "l2_io_error", KSTAT_DATA_UINT64 },
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
- { "memory_throttle_count", KSTAT_DATA_UINT64 }
+ { "memory_throttle_count", KSTAT_DATA_UINT64 },
+ { "memory_direct_count", KSTAT_DATA_UINT64 },
+ { "memory_indirect_count", KSTAT_DATA_UINT64 },
+ { "arc_no_grow", KSTAT_DATA_UINT64 },
+ { "arc_tempreserve", KSTAT_DATA_UINT64 },
+ { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
+ { "arc_meta_used", KSTAT_DATA_UINT64 },
+ { "arc_meta_limit", KSTAT_DATA_UINT64 },
+ { "arc_meta_max", KSTAT_DATA_UINT64 },
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
-
-static int arc_no_grow; /* Don't try to grow cache size */
-static uint64_t arc_tempreserve;
-static uint64_t arc_loaned_bytes;
-static uint64_t arc_meta_used;
-static uint64_t arc_meta_limit;
-static uint64_t arc_meta_max = 0;
+#define arc_no_grow ARCSTAT(arcstat_no_grow)
+#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
+#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
+#define arc_meta_used ARCSTAT(arcstat_meta_used)
+#define arc_meta_limit ARCSTAT(arcstat_meta_limit)
+#define arc_meta_max ARCSTAT(arcstat_meta_max)
typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
* Hash table routines
*/
-#define HT_LOCK_PAD 64
+#define HT_LOCK_ALIGN 64
+#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
struct ht_lock {
kmutex_t ht_lock;
#ifdef _KERNEL
- unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+ unsigned char pad[HT_LOCK_PAD];
#endif
};
/*
* L2ARC Performance Tunables
*/
-uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
-uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
-uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
-uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
-uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
-boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
-boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
-boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
+unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
+unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
+unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
+unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
+int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+int l2arc_feed_again = B_TRUE; /* turbo warmup */
+int l2arc_norw = B_TRUE; /* no reads during writes */
/*
* L2ARC Internals
{
int i;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+ /* Large allocations which do not require contiguous pages
+ * should be using vmem_free() in the linux kernel */
+ vmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#else
kmem_free(buf_hash_table.ht_table,
(buf_hash_table.ht_mask + 1) * sizeof (void *));
+#endif
for (i = 0; i < BUF_LOCKS; i++)
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
kmem_cache_destroy(hdr_cache);
refcount_create(&buf->b_refcnt);
cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_link_init(&buf->b_arc_node);
+ list_link_init(&buf->b_l2node);
arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
return (0);
hsize <<= 1;
retry:
buf_hash_table.ht_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+ /* Large allocations which do not require contiguous pages
+ * should be using vmem_alloc() in the linux kernel */
+ buf_hash_table.ht_table =
+ vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+#else
buf_hash_table.ht_table =
kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+#endif
if (buf_hash_table.ht_table == NULL) {
ASSERT(hsize > (1ULL << 8));
hsize >>= 1;
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
switch (type) {
+ default:
+ break;
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, space);
break;
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
switch (type) {
+ default:
+ break;
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, -space);
break;
mutex_exit(&state->arcs_mtx);
if (bytes_evicted < bytes)
- dprintf("only evicted %lld bytes from %x",
+ dprintf("only evicted %lld bytes from %x\n",
(longlong_t)bytes_evicted, state);
if (skipped)
}
if (bytes_deleted < bytes)
- dprintf("only deleted %lld bytes from %p",
+ dprintf("only deleted %lld bytes from %p\n",
(longlong_t)bytes_deleted, state);
}
static int
arc_reclaim_needed(void)
{
- uint64_t extra;
-
#ifdef _KERNEL
+ uint64_t extra;
if (needfree)
return (1);
kmem_cache_t *prev_data_cache = NULL;
extern kmem_cache_t *zio_buf_cache[];
extern kmem_cache_t *zio_data_buf_cache[];
-
#ifdef _KERNEL
- if (arc_meta_used >= arc_meta_limit) {
+ int retry = 0;
+
+ while ((arc_meta_used >= arc_meta_limit) && (retry < 10)) {
/*
* We are exceeding our meta-data cache limit.
* Purge some DNLC entries to release holds on meta-data.
*/
dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
+ retry++;
}
#if defined(__i386)
/*
arc_no_grow = FALSE;
}
+ /* Keep meta data usage within limits */
+ if (arc_meta_used >= arc_meta_limit)
+ arc_kmem_reap_now(ARC_RECLAIM_CONS);
+
arc_adjust();
if (arc_eviction_list != NULL)
/* block until needed, or one second, whichever is shorter */
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait(&arc_reclaim_thr_cv,
+ (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
&arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
}
thread_exit();
}
+#ifdef _KERNEL
+/*
+ * Under Linux the arc shrinker may be called for synchronous (direct)
+ * reclaim, or asynchronous (indirect) reclaim. When called by kswapd
+ * for indirect reclaim we take a conservative approach and just reap
+ * free slabs from the ARC caches. If this proves to be insufficient
+ * direct reclaim will be trigger. In direct reclaim a more aggressive
+ * strategy is used, data is evicted from the ARC and free slabs reaped.
+ */
+static int
+__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
+{
+ arc_reclaim_strategy_t strategy;
+ int arc_reclaim;
+
+ /* Return number of reclaimable pages based on arc_shrink_shift */
+ arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
+ >> arc_shrink_shift, 0);
+ if (sc->nr_to_scan == 0)
+ return (arc_reclaim);
+
+ /* Prevent reclaim below arc_c_min */
+ if (arc_reclaim <= 0)
+ return (-1);
+
+ /* Not allowed to perform filesystem reclaim */
+ if (!(sc->gfp_mask & __GFP_FS))
+ return (-1);
+
+ /* Reclaim in progress */
+ if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
+ return (-1);
+
+ if (current_is_kswapd()) {
+ strategy = ARC_RECLAIM_CONS;
+ ARCSTAT_INCR(arcstat_memory_indirect_count, 1);
+ } else {
+ strategy = ARC_RECLAIM_AGGR;
+ ARCSTAT_INCR(arcstat_memory_direct_count, 1);
+ }
+
+ arc_kmem_reap_now(strategy);
+ arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
+ >> arc_shrink_shift, 0);
+ mutex_exit(&arc_reclaim_thr_lock);
+
+ return (arc_reclaim);
+}
+SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
+
+SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
+#endif /* _KERNEL */
+
/*
* Adapt arc info given the number of bytes we are trying to add and
* the state that we are comming from. This function is only called
uint32_t *arc_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
- arc_buf_t *buf;
+ arc_buf_t *buf = NULL;
kmutex_t *hash_lock;
zio_t *rzio;
uint64_t guid = spa_guid(spa);
arc_callback_t *acb = NULL;
acb = kmem_zalloc(sizeof (arc_callback_t),
- KM_SLEEP);
+ KM_PUSHPAGE);
acb->acb_done = done;
acb->acb_private = private;
if (pio != NULL)
uint64_t size = BP_GET_LSIZE(bp);
arc_callback_t *acb;
vdev_t *vd = NULL;
- uint64_t addr;
+ uint64_t addr = -1;
boolean_t devw = B_FALSE;
if (hdr == NULL) {
ASSERT(!GHOST_STATE(hdr->b_state));
- acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+ acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE);
acb->acb_done = done;
acb->acb_private = private;
ARCSTAT_BUMP(arcstat_l2_hits);
cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
- KM_SLEEP);
+ KM_PUSHPAGE);
cb->l2rcb_buf = buf;
cb->l2rcb_spa = spa;
cb->l2rcb_bp = *bp;
arc_buf_hdr_t *hdr;
kmutex_t *hash_lock = NULL;
l2arc_buf_hdr_t *l2hdr;
- uint64_t buf_size;
+ uint64_t buf_size = 0;
/*
* It would be nice to assert that if it's DMU metadata (level >
* in order to compress/encrypt/etc the data. We therefor need to
* make sure that there is sufficient available memory for this.
*/
- if (error = arc_memory_throttle(reserve, anon_size, txg))
+ if ((error = arc_memory_throttle(reserve, anon_size, txg)))
return (error);
/*
* need to limit the cache to 1/8 of VM size.
*/
arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+ /*
+ * Register a shrinker to support synchronous (direct) memory
+ * reclaim from the arc. This is done to prevent kswapd from
+ * swapping out pages when it is preferable to shrink the arc.
+ */
+ spl_register_shrinker(&arc_shrinker);
#endif
/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
/* limit meta-data to 1/4 of the arc capacity */
arc_meta_limit = arc_c_max / 4;
+ arc_meta_max = 0;
/* Allow the tunable to override if it is reasonable */
if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
if (zfs_arc_p_min_shift > 0)
arc_p_min_shift = zfs_arc_p_min_shift;
+ if (zfs_arc_reduce_dnlc_percent > 0)
+ arc_reduce_dnlc_percent = zfs_arc_reduce_dnlc_percent;
+
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
arc_fini(void)
{
mutex_enter(&arc_reclaim_thr_lock);
+#ifdef _KERNEL
+ spl_unregister_shrinker(&arc_shrinker);
+#endif /* _KERNEL */
+
arc_thread_exit = 1;
while (arc_thread_exit != 0)
cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
static list_t *
l2arc_list_locked(int list_num, kmutex_t **lock)
{
- list_t *list;
+ list_t *list = NULL;
ASSERT(list_num >= 0 && list_num <= 3);
list_t *list;
uint64_t passed_sz, write_sz, buf_sz, headroom;
void *buf_data;
- kmutex_t *hash_lock, *list_lock;
+ kmutex_t *hash_lock, *list_lock = NULL;
boolean_t have_lock, full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
while (l2arc_thread_exit == 0) {
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
- next);
+ (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
+ &l2arc_feed_thr_lock, next);
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
next = ddi_get_lbolt() + hz;
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
+ list_link_init(&adddev->l2ad_node);
ASSERT3U(adddev->l2ad_write, >, 0);
/*
cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
mutex_exit(&l2arc_feed_thr_lock);
}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(arc_read);
+EXPORT_SYMBOL(arc_buf_remove_ref);
+EXPORT_SYMBOL(arc_getbuf_func);
+
+module_param(zfs_arc_min, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
+
+module_param(zfs_arc_max, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
+
+module_param(zfs_arc_meta_limit, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
+
+module_param(zfs_arc_reduce_dnlc_percent, int, 0444);
+MODULE_PARM_DESC(zfs_arc_reduce_dnlc_percent, "Meta reclaim percentage");
+
+module_param(zfs_arc_grow_retry, int, 0444);
+MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
+
+module_param(zfs_arc_shrink_shift, int, 0444);
+MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
+
+module_param(zfs_arc_p_min_shift, int, 0444);
+MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
+
+module_param(l2arc_write_max, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
+
+module_param(l2arc_write_boost, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
+
+module_param(l2arc_headroom, ulong, 0444);
+MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
+
+module_param(l2arc_feed_secs, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
+
+module_param(l2arc_feed_min_ms, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
+
+module_param(l2arc_noprefetch, int, 0444);
+MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
+
+module_param(l2arc_feed_again, int, 0444);
+MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
+
+module_param(l2arc_norw, int, 0444);
+MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
+
+#endif