X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Farc.c;h=ff631e61b2eaa2eaf04d2ad1712f6f17db554060;hb=5547c2f1bf49802835fd6c52f15115ba344a2a8b;hp=8adb54dc6e195748d56578f3aa1e8eb213a42f86;hpb=428870ff734fdaccc342b33fc53cf94724409a46;p=zfs.git diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 8adb54d..ff631e6 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -174,12 +174,13 @@ static boolean_t arc_warm; /* * These tunables are for performance analysis. */ -uint64_t zfs_arc_max; -uint64_t zfs_arc_min; -uint64_t zfs_arc_meta_limit = 0; +unsigned long zfs_arc_max = 0; +unsigned long zfs_arc_min = 0; +unsigned long zfs_arc_meta_limit = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; +int zfs_arc_reduce_dnlc_percent = 0; /* * Note that buffers can be in one of 6 states: @@ -282,6 +283,14 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_memory_direct_count; + kstat_named_t arcstat_memory_indirect_count; + kstat_named_t arcstat_no_grow; + kstat_named_t arcstat_tempreserve; + kstat_named_t arcstat_loaned_bytes; + kstat_named_t arcstat_meta_used; + kstat_named_t arcstat_meta_limit; + kstat_named_t arcstat_meta_max; } arc_stats_t; static arc_stats_t arc_stats = { @@ -337,7 +346,15 @@ static arc_stats_t arc_stats = { { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 } + { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "memory_direct_count", KSTAT_DATA_UINT64 }, + { "memory_indirect_count", KSTAT_DATA_UINT64 }, + { "arc_no_grow", KSTAT_DATA_UINT64 }, + { "arc_tempreserve", KSTAT_DATA_UINT64 }, + { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, + { "arc_meta_used", KSTAT_DATA_UINT64 }, + { "arc_meta_limit", KSTAT_DATA_UINT64 }, + { "arc_meta_max", KSTAT_DATA_UINT64 }, }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -399,13 +416,12 @@ static arc_state_t *arc_l2c_only; #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ - -static int arc_no_grow; /* Don't try to grow cache size */ -static uint64_t arc_tempreserve; -static uint64_t arc_loaned_bytes; -static uint64_t arc_meta_used; -static uint64_t arc_meta_limit; -static uint64_t arc_meta_max = 0; +#define arc_no_grow ARCSTAT(arcstat_no_grow) +#define arc_tempreserve ARCSTAT(arcstat_tempreserve) +#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) +#define arc_meta_used ARCSTAT(arcstat_meta_used) +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) +#define arc_meta_max ARCSTAT(arcstat_meta_max) typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; @@ -523,12 +539,13 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); * Hash table routines */ -#define HT_LOCK_PAD 64 +#define HT_LOCK_ALIGN 64 +#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN))) struct ht_lock { kmutex_t ht_lock; #ifdef _KERNEL - unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; + unsigned char pad[HT_LOCK_PAD]; #endif }; @@ -565,14 +582,14 @@ uint64_t zfs_crc64_table[256]; /* * L2ARC Performance Tunables */ -uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ -uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ -uint64_t l2arc_headroom = 
L2ARC_HEADROOM; /* number of dev writes */ -uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ -boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ -boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ +unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ +int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +int l2arc_feed_again = B_TRUE; /* turbo warmup */ +int l2arc_norw = B_TRUE; /* no reads during writes */ /* * L2ARC Internals @@ -772,8 +789,15 @@ buf_fini(void) { int i; +#if defined(_KERNEL) && defined(HAVE_SPL) + /* Large allocations which do not require contiguous pages + * should be using vmem_free() in the linux kernel */ + vmem_free(buf_hash_table.ht_table, + (buf_hash_table.ht_mask + 1) * sizeof (void *)); +#else kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); +#endif for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_cache); @@ -794,6 +818,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag) refcount_create(&buf->b_refcnt); cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + list_link_init(&buf->b_arc_node); + list_link_init(&buf->b_l2node); arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); return (0); @@ -873,8 +899,15 @@ buf_init(void) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; +#if defined(_KERNEL) && defined(HAVE_SPL) + /* Large allocations which do not require contiguous pages + * should be using vmem_alloc() in the linux kernel */ + buf_hash_table.ht_table = + vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); +#else buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); +#endif if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; @@ -952,11 +985,6 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) void arc_buf_thaw(arc_buf_t *buf) { - kmutex_t *hash_lock; - - hash_lock = HDR_LOCK(buf->b_hdr); - mutex_enter(hash_lock); - if (zfs_flags & ZFS_DEBUG_MODIFY) { if (buf->b_hdr->b_state != arc_anon) panic("modifying non-anon buffer!"); @@ -978,7 +1006,6 @@ arc_buf_thaw(arc_buf_t *buf) } mutex_exit(&buf->b_hdr->b_freeze_lock); - mutex_exit(hash_lock); } void @@ -1149,6 +1176,8 @@ arc_space_consume(uint64_t space, arc_space_type_t type) ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { + default: + break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; @@ -1173,6 +1202,8 @@ arc_space_return(uint64_t space, arc_space_type_t type) ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { + default: + break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; @@ -1432,10 +1463,11 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { + l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; + ASSERT(refcount_is_zero(&hdr->b_refcnt)); ASSERT3P(hdr->b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; if (l2hdr != NULL) { 
boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); @@ -1709,7 +1741,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, mutex_exit(&state->arcs_mtx); if (bytes_evicted < bytes) - dprintf("only evicted %lld bytes from %x", + dprintf("only evicted %lld bytes from %x\n", (longlong_t)bytes_evicted, state); if (skipped) @@ -1730,12 +1762,12 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mru_ghost->arcs_lsize[type], mru_over); - arc_evict_ghost(arc_mru_ghost, NULL, todelete); + arc_evict_ghost(arc_mru_ghost, 0, todelete); } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c); - arc_evict_ghost(arc_mfu_ghost, NULL, todelete); + arc_evict_ghost(arc_mfu_ghost, 0, todelete); } } @@ -1750,18 +1782,25 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t marker; list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; ASSERT(GHOST_STATE(state)); + bzero(&marker, sizeof(marker)); top: mutex_enter(&state->arcs_mtx); for (ab = list_tail(list); ab; ab = ab_prev) { ab_prev = list_prev(list, ab); if (spa && ab->b_spa != spa) continue; + + /* ignore markers */ + if (ab->b_spa == 0) + continue; + hash_lock = HDR_LOCK(ab); /* caller may be trying to modify this buffer, skip it */ if (MUTEX_HELD(hash_lock)) @@ -1788,15 +1827,21 @@ top: DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); if (bytes >= 0 && bytes_deleted >= bytes) break; - } else { - if (bytes < 0) { - mutex_exit(&state->arcs_mtx); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - goto top; - } + } else if (bytes < 0) { + /* + * Insert a list marker and then wait for the + * hash lock to become available. Once its + * available, restart from where we left off. 
+ */ + list_insert_after(list, ab, &marker); + mutex_exit(&state->arcs_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + mutex_enter(&state->arcs_mtx); + ab_prev = list_prev(list, &marker); + list_remove(list, &marker); + } else bufs_skipped += 1; - } } mutex_exit(&state->arcs_mtx); @@ -1812,7 +1857,7 @@ top: } if (bytes_deleted < bytes) - dprintf("only deleted %lld bytes from %p", + dprintf("only deleted %lld bytes from %p\n", (longlong_t)bytes_deleted, state); } @@ -1825,18 +1870,19 @@ arc_adjust(void) * Adjust MRU size */ - adjustment = MIN(arc_size - arc_c, - arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p); + adjustment = MIN((int64_t)(arc_size - arc_c), + (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - + arc_p)); if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); - (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); adjustment -= delta; } if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); - (void) arc_evict(arc_mru, NULL, delta, FALSE, + (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); } @@ -1848,14 +1894,14 @@ arc_adjust(void) if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); - (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); adjustment -= delta; } if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { int64_t delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); - (void) arc_evict(arc_mfu, NULL, delta, FALSE, + (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA); } @@ -1867,7 +1913,7 @@ arc_adjust(void) if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { delta = MIN(arc_mru_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mru_ghost, NULL, delta); + arc_evict_ghost(arc_mru_ghost, 0, delta); } adjustment = @@ -1875,7 +1921,7 @@ arc_adjust(void) if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { delta = MIN(arc_mfu_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mfu_ghost, NULL, delta); + arc_evict_ghost(arc_mfu_ghost, 0, delta); } } @@ -1976,9 +2022,8 @@ arc_shrink(void) static int arc_reclaim_needed(void) { - uint64_t extra; - #ifdef _KERNEL + uint64_t extra; if (needfree) return (1); @@ -2040,14 +2085,16 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) kmem_cache_t *prev_data_cache = NULL; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; - #ifdef _KERNEL - if (arc_meta_used >= arc_meta_limit) { + int retry = 0; + + while ((arc_meta_used >= arc_meta_limit) && (retry < 10)) { /* * We are exceeding our meta-data cache limit. * Purge some DNLC entries to release holds on meta-data. 
*/ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); + retry++; } #if defined(__i386) /* @@ -2113,16 +2160,18 @@ arc_reclaim_thread(void) arc_no_grow = FALSE; } - if (2 * arc_c < arc_size + - arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size) - arc_adjust(); + /* Keep meta data usage within limits */ + if (arc_meta_used >= arc_meta_limit) + arc_kmem_reap_now(ARC_RECLAIM_CONS); + + arc_adjust(); if (arc_eviction_list != NULL) arc_do_user_evicts(); /* block until needed, or one second, whichever is shorter */ CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&arc_reclaim_thr_cv, + (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); } @@ -2133,6 +2182,59 @@ arc_reclaim_thread(void) thread_exit(); } +#ifdef _KERNEL +/* + * Under Linux the arc shrinker may be called for synchronous (direct) + * reclaim, or asynchronous (indirect) reclaim. When called by kswapd + * for indirect reclaim we take a conservative approach and just reap + * free slabs from the ARC caches. If this proves to be insufficient + * direct reclaim will be trigger. In direct reclaim a more aggressive + * strategy is used, data is evicted from the ARC and free slabs reaped. + */ +static int +__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) +{ + arc_reclaim_strategy_t strategy; + int arc_reclaim; + + /* Return number of reclaimable pages based on arc_shrink_shift */ + arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min)) + >> arc_shrink_shift, 0); + if (sc->nr_to_scan == 0) + return (arc_reclaim); + + /* Prevent reclaim below arc_c_min */ + if (arc_reclaim <= 0) + return (-1); + + /* Not allowed to perform filesystem reclaim */ + if (!(sc->gfp_mask & __GFP_FS)) + return (-1); + + /* Reclaim in progress */ + if (mutex_tryenter(&arc_reclaim_thr_lock) == 0) + return (-1); + + if (current_is_kswapd()) { + strategy = ARC_RECLAIM_CONS; + ARCSTAT_INCR(arcstat_memory_indirect_count, 1); + } else { + strategy = ARC_RECLAIM_AGGR; + ARCSTAT_INCR(arcstat_memory_direct_count, 1); + } + + arc_kmem_reap_now(strategy); + arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min)) + >> arc_shrink_shift, 0); + mutex_exit(&arc_reclaim_thr_lock); + + return (arc_reclaim); +} +SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func); + +SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS); +#endif /* _KERNEL */ + /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are comming from. This function is only called @@ -2159,6 +2261,7 @@ arc_adapt(int bytes, arc_state_t *state) if (state == arc_mru_ghost) { mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); + mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { @@ -2166,6 +2269,7 @@ arc_adapt(int bytes, arc_state_t *state) mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); + mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); arc_p = MAX(arc_p_min, arc_p - delta); @@ -2295,7 +2399,7 @@ arc_get_data_buf(arc_buf_t *buf) state = (arc_mru->arcs_lsize[type] >= size && mfu_space > arc_mfu->arcs_size) ? 
arc_mru : arc_mfu; } - if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { + if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); @@ -2666,7 +2770,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, uint32_t *arc_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr; - arc_buf_t *buf; + arc_buf_t *buf = NULL; kmutex_t *hash_lock; zio_t *rzio; uint64_t guid = spa_guid(spa); @@ -2691,7 +2795,7 @@ top: arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), - KM_SLEEP); + KM_PUSHPAGE); acb->acb_done = done; acb->acb_private = private; if (pio != NULL) @@ -2748,7 +2852,7 @@ top: uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; - uint64_t addr; + uint64_t addr = -1; boolean_t devw = B_FALSE; if (hdr == NULL) { @@ -2807,7 +2911,7 @@ top: ASSERT(!GHOST_STATE(hdr->b_state)); - acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); + acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE); acb->acb_done = done; acb->acb_private = private; @@ -2856,7 +2960,7 @@ top: ARCSTAT_BUMP(arcstat_l2_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), - KM_SLEEP); + KM_PUSHPAGE); cb->l2rcb_buf = buf; cb->l2rcb_spa = spa; cb->l2rcb_bp = *bp; @@ -3026,7 +3130,7 @@ arc_release(arc_buf_t *buf, void *tag) arc_buf_hdr_t *hdr; kmutex_t *hash_lock = NULL; l2arc_buf_hdr_t *l2hdr; - uint64_t buf_size; + uint64_t buf_size = 0; /* * It would be nice to assert that if it's DMU metadata (level > @@ -3401,7 +3505,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * in order to compress/encrypt/etc the data. We therefor need to * make sure that there is sufficient available memory for this. */ - if (error = arc_memory_throttle(reserve, anon_size, txg)) + if ((error = arc_memory_throttle(reserve, anon_size, txg))) return (error); /* @@ -3445,6 +3549,12 @@ arc_init(void) * need to limit the cache to 1/8 of VM size. */ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); + /* + * Register a shrinker to support synchronous (direct) memory + * reclaim from the arc. This is done to prevent kswapd from + * swapping out pages when it is preferable to shrink the arc. + */ + spl_register_shrinker(&arc_shrinker); #endif /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ @@ -3470,6 +3580,7 @@ arc_init(void) /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; + arc_meta_max = 0; /* Allow the tunable to override if it is reasonable */ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) @@ -3487,6 +3598,9 @@ arc_init(void) if (zfs_arc_p_min_shift > 0) arc_p_min_shift = zfs_arc_p_min_shift; + if (zfs_arc_reduce_dnlc_percent > 0) + arc_reduce_dnlc_percent = zfs_arc_reduce_dnlc_percent; + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -3561,6 +3675,10 @@ void arc_fini(void) { mutex_enter(&arc_reclaim_thr_lock); +#ifdef _KERNEL + spl_unregister_shrinker(&arc_shrinker); +#endif /* _KERNEL */ + arc_thread_exit = 1; while (arc_thread_exit != 0) cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); @@ -3870,7 +3988,7 @@ out: * Free buffers that were tagged for destruction. 
*/ static void -l2arc_do_free_on_write() +l2arc_do_free_on_write(void) { list_t *buflist; l2arc_data_free_t *df, *df_prev; @@ -4050,7 +4168,7 @@ l2arc_read_done(zio_t *zio) static list_t * l2arc_list_locked(int list_num, kmutex_t **lock) { - list_t *list; + list_t *list = NULL; ASSERT(list_num >= 0 && list_num <= 3); @@ -4223,11 +4341,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) list_t *list; uint64_t passed_sz, write_sz, buf_sz, headroom; void *buf_data; - kmutex_t *hash_lock, *list_lock; + kmutex_t *hash_lock, *list_lock = NULL; boolean_t have_lock, full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_guid(spa); + int try; ASSERT(dev->l2ad_vdev != NULL); @@ -4241,7 +4360,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) * Copy buffers for L2ARC writing. */ mutex_enter(&l2arc_buflist_mtx); - for (int try = 0; try <= 3; try++) { + for (try = 0; try <= 3; try++) { list = l2arc_list_locked(try, &list_lock); passed_sz = 0; @@ -4405,8 +4524,8 @@ l2arc_feed_thread(void) while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - next); + (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv, + &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; @@ -4438,6 +4557,16 @@ l2arc_feed_thread(void) ASSERT(spa != NULL); /* + * If the pool is read-only then force the feed thread to + * sleep a little longer. + */ + if (!spa_writeable(spa)) { + next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + /* * Avoid contributing to memory pressure. */ if (arc_reclaim_needed()) { @@ -4514,6 +4643,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; + list_link_init(&adddev->l2ad_node); ASSERT3U(adddev->l2ad_write, >, 0); /* @@ -4639,3 +4769,55 @@ l2arc_stop(void) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } + +#if defined(_KERNEL) && defined(HAVE_SPL) +EXPORT_SYMBOL(arc_read); +EXPORT_SYMBOL(arc_buf_remove_ref); +EXPORT_SYMBOL(arc_getbuf_func); + +module_param(zfs_arc_min, ulong, 0444); +MODULE_PARM_DESC(zfs_arc_min, "Min arc size"); + +module_param(zfs_arc_max, ulong, 0444); +MODULE_PARM_DESC(zfs_arc_max, "Max arc size"); + +module_param(zfs_arc_meta_limit, ulong, 0444); +MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size"); + +module_param(zfs_arc_reduce_dnlc_percent, int, 0444); +MODULE_PARM_DESC(zfs_arc_reduce_dnlc_percent, "Meta reclaim percentage"); + +module_param(zfs_arc_grow_retry, int, 0444); +MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); + +module_param(zfs_arc_shrink_shift, int, 0444); +MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)"); + +module_param(zfs_arc_p_min_shift, int, 0444); +MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p"); + +module_param(l2arc_write_max, ulong, 0444); +MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval"); + +module_param(l2arc_write_boost, ulong, 0444); +MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup"); + +module_param(l2arc_headroom, ulong, 0444); +MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache"); + +module_param(l2arc_feed_secs, ulong, 0444); +MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing"); + 
+module_param(l2arc_feed_min_ms, ulong, 0444); +MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds"); + +module_param(l2arc_noprefetch, int, 0444); +MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers"); + +module_param(l2arc_feed_again, int, 0444); +MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup"); + +module_param(l2arc_norw, int, 0444); +MODULE_PARM_DESC(l2arc_norw, "No reads during writes"); + +#endif
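
The centerpiece of this patch is the registration of a Linux shrinker so that direct memory
reclaim can shrink the ARC instead of pushing kswapd toward swapping. As a rough sketch of
that kernel interface only (not the ZFS code itself, which goes through the
SPL_SHRINKER_CALLBACK_WRAPPER/SPL_SHRINKER_DECLARE wrappers above so it builds against the
several shrinker ABIs the SPL supports), a minimal standalone shrinker for a hypothetical
page cache might look like the following. It assumes a kernel of roughly the same vintage as
this patch (the pre-3.12 single .shrink callback taking a struct shrink_control); the module
name and the example_cached_pages counter are invented for illustration.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/shrinker.h>
#include <linux/atomic.h>

/* Hypothetical count of reclaimable pages held by this module's cache */
static atomic_long_t example_cached_pages = ATOMIC_LONG_INIT(0);

static int
example_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
	long cached = atomic_long_read(&example_cached_pages);

	/* nr_to_scan == 0 is only a query: report how many pages could be freed */
	if (sc->nr_to_scan == 0)
		return (cached);

	/* Refuse to recurse into filesystem code from fs-context allocations */
	if (!(sc->gfp_mask & __GFP_FS))
		return (-1);

	/* Release up to nr_to_scan pages from the hypothetical cache */
	atomic_long_sub(min_t(long, cached, sc->nr_to_scan),
	    &example_cached_pages);

	return (atomic_long_read(&example_cached_pages));
}

static struct shrinker example_shrinker = {
	.shrink = example_shrink,
	.seeks = DEFAULT_SEEKS,
};

static int __init
example_init(void)
{
	register_shrinker(&example_shrinker);
	return (0);
}

static void __exit
example_exit(void)
{
	unregister_shrinker(&example_shrinker);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

The control flow mirrors __arc_shrinker_func() in the hunk above: a zero nr_to_scan call is
treated as a query, allocations lacking __GFP_FS are refused to avoid recursing into the
filesystem, and the callback returns the number of reclaimable pages remaining. The ARC
version additionally refuses to dip below arc_c_min and picks a conservative strategy for
kswapd (indirect) reclaim versus an aggressive one for direct reclaim.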