X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Farc.c;h=ce4a0239c0c0ba18370fd9ea43cb2b176fb90e05;hb=68121a03daf58a7d5b9351f110196b8ce806e1fa;hp=cbc27d3687a936ffdc7e4bd3aa23fd68db862e8f;hpb=409dc1a570a836737b2a5bb43658cdde703c935e;p=zfs.git

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index cbc27d3..ce4a023 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -20,6 +20,9 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 /*
@@ -78,9 +81,9 @@
  * types of locks: 1) the hash table lock array, and 2) the
  * arc list locks.
  *
- * Buffers do not have their own mutexs, rather they rely on the
- * hash table mutexs for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexs).
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
  *
  * buf_hash_find() returns the appropriate mutex (held) when it
  * locates the requested buffer in the hash table.  It returns
@@ -126,6 +129,7 @@
 
 #include <sys/spa.h>
 #include <sys/zio.h>
+#include <sys/zio_compress.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/vdev.h>
@@ -145,12 +149,8 @@ static kmutex_t		arc_reclaim_thr_lock;
 static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
 static uint8_t		arc_thread_exit;
 
-extern int zfs_write_limit_shift;
-extern uint64_t zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
-
 /* number of bytes to prune from caches when at arc_meta_limit is reached */
-uint_t arc_meta_prune = 1048576;
+int zfs_arc_meta_prune = 1048576;
 
 typedef enum arc_reclaim_strategy {
 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
@@ -158,22 +158,31 @@ typedef enum arc_reclaim_strategy {
 } arc_reclaim_strategy_t;
 
 /* number of seconds before growing cache again */
-static int		arc_grow_retry = 60;
+int zfs_arc_grow_retry = 5;
 
 /* shift of arc_c for calculating both min and max arc_p */
-static int		arc_p_min_shift = 4;
+int zfs_arc_p_min_shift = 4;
 
 /* log2(fraction of arc to reclaim) */
-static int		arc_shrink_shift = 5;
+int zfs_arc_shrink_shift = 5;
 
 /*
  * minimum lifespan of a prefetch block in clock ticks
  * (initialized in arc_init())
  */
-static int		arc_min_prefetch_lifespan;
+int zfs_arc_min_prefetch_lifespan = HZ;
+
+/* disable arc proactive arc throttle due to low memory */
+int zfs_arc_memory_throttle_disable = 1;
+
+/* disable duplicate buffer eviction */
+int zfs_disable_dup_eviction = 0;
 
 static int arc_dead;
 
+/* expiration time for arc_no_grow */
+static clock_t arc_grow_time = 0;
+
 /*
  * The arc has filled available memory and has now warmed up.
  */
@@ -185,10 +194,6 @@ static boolean_t arc_warm;
 unsigned long zfs_arc_max = 0;
 unsigned long zfs_arc_min = 0;
 unsigned long zfs_arc_meta_limit = 0;
-int zfs_arc_grow_retry = 0;
-int zfs_arc_shrink_shift = 0;
-int zfs_arc_p_min_shift = 0;
-int zfs_arc_meta_prune = 0;
 
 /*
  * Note that buffers can be in one of 6 states:
@@ -304,8 +309,15 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_l2_cksum_bad;
 	kstat_named_t arcstat_l2_io_error;
 	kstat_named_t arcstat_l2_size;
+	kstat_named_t arcstat_l2_asize;
 	kstat_named_t arcstat_l2_hdr_size;
+	kstat_named_t arcstat_l2_compress_successes;
+	kstat_named_t arcstat_l2_compress_zeros;
+	kstat_named_t arcstat_l2_compress_failures;
 	kstat_named_t arcstat_memory_throttle_count;
+	kstat_named_t arcstat_duplicate_buffers;
+	kstat_named_t arcstat_duplicate_buffers_size;
+	kstat_named_t arcstat_duplicate_reads;
 	kstat_named_t arcstat_memory_direct_count;
 	kstat_named_t arcstat_memory_indirect_count;
 	kstat_named_t arcstat_no_grow;
@@ -384,8 +396,15 @@ static arc_stats_t arc_stats = {
 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
 	{ "l2_size",			KSTAT_DATA_UINT64 },
+	{ "l2_asize",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
+	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
+	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
+	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
+	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
+	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
+	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
 	{ "memory_direct_count",	KSTAT_DATA_UINT64 },
 	{ "memory_indirect_count",	KSTAT_DATA_UINT64 },
 	{ "arc_no_grow",		KSTAT_DATA_UINT64 },
@@ -463,6 +482,9 @@ static arc_state_t	*arc_l2c_only;
 #define	arc_meta_limit	ARCSTAT(arcstat_meta_limit)
 #define	arc_meta_max	ARCSTAT(arcstat_meta_max)
 
+#define	L2ARC_IS_VALID_COMPRESS(_c_) \
+	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
+
 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 
 typedef struct arc_callback arc_callback_t;
@@ -492,7 +514,6 @@ struct arc_buf_hdr {
 
 	kmutex_t		b_freeze_lock;
 	zio_cksum_t		*b_freeze_cksum;
-	void			*b_thawed;
 
 	arc_buf_hdr_t		*b_hash_next;
 	arc_buf_t		*b_buf;
@@ -529,7 +550,8 @@ static arc_buf_hdr_t arc_eviction_hdr;
 static void arc_get_data_buf(arc_buf_t *buf);
 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 static int arc_evict_needed(arc_buf_contents_t type);
-static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
+static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type);
 
 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 
@@ -614,7 +636,12 @@ uint64_t zfs_crc64_table[256];
  */
 
 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
-#define	L2ARC_HEADROOM		2		/* num of writes */
+#define	L2ARC_HEADROOM		2			/* num of writes */
+/*
+ * If we discover during ARC scan any buffers to be compressed, we boost
+ * our headroom for the next scanning cycle by this percentage multiple.
+ */
+#define	L2ARC_HEADROOM_BOOST	200
 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
@@ -627,11 +654,13 @@ uint64_t zfs_crc64_table[256];
 unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;	/* def max write size */
 unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra warmup write */
 unsigned long l2arc_headroom = L2ARC_HEADROOM;		/* # of dev writes */
+unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
 unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval msecs */
 int l2arc_noprefetch = B_TRUE;			/* don't cache prefetch bufs */
+int l2arc_nocompress = B_FALSE;			/* don't compress bufs */
 int l2arc_feed_again = B_TRUE;			/* turbo warmup */
-int l2arc_norw = B_TRUE;			/* no reads during writes */
+int l2arc_norw = B_FALSE;			/* no reads during writes */
 
 /*
  * L2ARC Internals
@@ -640,8 +669,6 @@ typedef struct l2arc_dev {
 	vdev_t			*l2ad_vdev;	/* vdev */
 	spa_t			*l2ad_spa;	/* spa */
 	uint64_t		l2ad_hand;	/* next write location */
-	uint64_t		l2ad_write;	/* desired write size, bytes */
-	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
 	uint64_t		l2ad_start;	/* first addr on device */
 	uint64_t		l2ad_end;	/* last addr on device */
 	uint64_t		l2ad_evict;	/* last addr eviction reached */
@@ -662,11 +689,12 @@ static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
 static uint64_t l2arc_ndev;			/* number of devices */
 
 typedef struct l2arc_read_callback {
-	arc_buf_t	*l2rcb_buf;		/* read buffer */
-	spa_t		*l2rcb_spa;		/* spa */
-	blkptr_t	l2rcb_bp;		/* original blkptr */
-	zbookmark_t	l2rcb_zb;		/* original bookmark */
-	int		l2rcb_flags;		/* original flags */
+	arc_buf_t		*l2rcb_buf;		/* read buffer */
+	spa_t			*l2rcb_spa;		/* spa */
+	blkptr_t		l2rcb_bp;		/* original blkptr */
+	zbookmark_t		l2rcb_zb;		/* original bookmark */
+	int			l2rcb_flags;		/* original flags */
+	enum zio_compress	l2rcb_compress;		/* applied compress */
 } l2arc_read_callback_t;
 
 typedef struct l2arc_write_callback {
@@ -676,8 +704,14 @@ typedef struct l2arc_write_callback {
 
 struct l2arc_buf_hdr {
 	/* protected by arc_buf_hdr  mutex */
-	l2arc_dev_t	*b_dev;			/* L2ARC device */
-	uint64_t	b_daddr;		/* disk address, offset byte */
+	l2arc_dev_t		*b_dev;		/* L2ARC device */
+	uint64_t		b_daddr;	/* disk address, offset byte */
+	/* compression applied to buffer data */
+	enum zio_compress	b_compress;
+	/* real alloc'd buffer size depending on b_compress applied */
+	int			b_asize;
+	/* temporary buffer holder for in-flight compressed data */
+	void			*b_tmp_cdata;
 };
 
 typedef struct l2arc_data_free {
@@ -696,6 +730,11 @@ static void l2arc_read_done(zio_t *zio);
 static void l2arc_hdr_stat_add(void);
 static void l2arc_hdr_stat_remove(void);
 
+static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
+static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
+    enum zio_compress c);
+static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
+
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
@@ -875,7 +914,6 @@ buf_cons(void *vbuf, void *unused, int kmflag)
 
 	bzero(buf, sizeof (arc_buf_t));
 	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
-	rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
 	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 
 	return (0);
@@ -905,25 +943,9 @@ buf_dest(void *vbuf, void *unused)
 	arc_buf_t *buf = vbuf;
 
 	mutex_destroy(&buf->b_evict_lock);
-	rw_destroy(&buf->b_data_lock);
 	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
-/*
- * Reclaim callback -- invoked when memory is low.
- */
-/* ARGSUSED */
-static void
-hdr_recl(void *unused)
-{
-	/*
-	 * umem calls the reclaim func when we destroy the buf cache,
-	 * which is after we do arc_fini().
-	 */
-	if (!arc_dead)
-		cv_signal(&arc_reclaim_thr_cv);
-}
-
 static void
 buf_init(void)
 {
@@ -956,7 +978,7 @@ retry:
 	}
 
 	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
-	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+	    0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0);
 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 
@@ -1041,12 +1063,6 @@ arc_buf_thaw(arc_buf_t *buf)
 		buf->b_hdr->b_freeze_cksum = NULL;
 	}
 
-	if (zfs_flags & ZFS_DEBUG_MODIFY) {
-		if (buf->b_hdr->b_thawed)
-			kmem_free(buf->b_hdr->b_thawed, 1);
-		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
-	}
-
 	mutex_exit(&buf->b_hdr->b_freeze_lock);
 }
 
@@ -1083,7 +1099,7 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 		ASSERT(list_link_active(&ab->b_arc_node));
 		list_remove(list, ab);
 		if (GHOST_STATE(ab->b_state)) {
-			ASSERT3U(ab->b_datacnt, ==, 0);
+			ASSERT0(ab->b_datacnt);
 			ASSERT3P(ab->b_buf, ==, NULL);
 			delta = ab->b_size;
 		}
@@ -1268,23 +1284,6 @@ arc_space_return(uint64_t space, arc_space_type_t type)
 	atomic_add_64(&arc_size, -space);
 }
 
-void *
-arc_data_buf_alloc(uint64_t size)
-{
-	if (arc_evict_needed(ARC_BUFC_DATA))
-		cv_signal(&arc_reclaim_thr_cv);
-	atomic_add_64(&arc_size, size);
-	return (zio_data_buf_alloc(size));
-}
-
-void
-arc_data_buf_free(void *buf, uint64_t size)
-{
-	zio_data_buf_free(buf, size);
-	ASSERT(arc_size >= size);
-	atomic_add_64(&arc_size, -size);
-}
-
 arc_buf_t *
 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
 {
@@ -1296,7 +1295,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
 	ASSERT(BUF_EMPTY(hdr));
 	hdr->b_size = size;
 	hdr->b_type = type;
-	hdr->b_spa = spa_guid(spa);
+	hdr->b_spa = spa_load_guid(spa);
 	hdr->b_state = arc_anon;
 	hdr->b_arc_access = 0;
 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
@@ -1383,6 +1382,17 @@ arc_buf_clone(arc_buf_t *from)
 	hdr->b_buf = buf;
 	arc_get_data_buf(buf);
 	bcopy(from->b_data, buf->b_data, size);
+
+	/*
+	 * This buffer already exists in the arc so create a duplicate
+	 * copy for the caller.  If the buffer is associated with user data
+	 * then track the size and number of duplicates.  These stats will be
+	 * updated as duplicate buffers are created and destroyed.
+	 */
+	if (hdr->b_type == ARC_BUFC_DATA) {
+		ARCSTAT_BUMP(arcstat_duplicate_buffers);
+		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
+	}
 	hdr->b_datacnt += 1;
 	return (buf);
 }
@@ -1430,7 +1440,7 @@ arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
 {
 	if (HDR_L2_WRITING(hdr)) {
 		l2arc_data_free_t *df;
-		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE);
 		df->l2df_data = data;
 		df->l2df_size = size;
 		df->l2df_func = free_func;
@@ -1481,6 +1491,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
 		ASSERT3U(state->arcs_size, >=, size);
 		atomic_add_64(&state->arcs_size, -size);
 		buf->b_data = NULL;
+
+		/*
+		 * If we're destroying a duplicate buffer make sure
+		 * that the appropriate statistics are updated.
+		 */
+		if (buf->b_hdr->b_datacnt > 1 &&
+		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
+			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
+		}
 		ASSERT(buf->b_hdr->b_datacnt > 0);
 		buf->b_hdr->b_datacnt -= 1;
 	}
@@ -1531,7 +1551,9 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
 		if (l2hdr != NULL) {
 			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
 			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
 			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+			arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
 			if (hdr->b_state == arc_l2c_only)
 				l2arc_hdr_stat_remove();
 			hdr->b_l2hdr = NULL;
@@ -1567,10 +1589,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
 		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
 		hdr->b_freeze_cksum = NULL;
 	}
-	if (hdr->b_thawed) {
-		kmem_free(hdr->b_thawed, 1);
-		hdr->b_thawed = NULL;
-	}
 
 	ASSERT(!list_link_active(&hdr->b_arc_node));
 	ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1629,7 +1647,7 @@ int
 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
-	kmutex_t *hash_lock = HDR_LOCK(hdr);
+	kmutex_t *hash_lock = NULL;
 	int no_callback = (buf->b_efunc == NULL);
 
 	if (hdr->b_state == arc_anon) {
@@ -1638,6 +1656,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
 		return (no_callback);
 	}
 
+	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 	hdr = buf->b_hdr;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
@@ -1666,6 +1685,48 @@ arc_buf_size(arc_buf_t *buf)
 }
 
 /*
+ * Called from the DMU to determine if the current buffer should be
+ * evicted. In order to ensure proper locking, the eviction must be initiated
+ * from the DMU. Return true if the buffer is associated with user data and
+ * duplicate buffers still exist.
+ */
+boolean_t
+arc_buf_eviction_needed(arc_buf_t *buf)
+{
+	arc_buf_hdr_t *hdr;
+	boolean_t evict_needed = B_FALSE;
+
+	if (zfs_disable_dup_eviction)
+		return (B_FALSE);
+
+	mutex_enter(&buf->b_evict_lock);
+	hdr = buf->b_hdr;
+	if (hdr == NULL) {
+		/*
+		 * We are in arc_do_user_evicts(); let that function
+		 * perform the eviction.
+		 */
+		ASSERT(buf->b_data == NULL);
+		mutex_exit(&buf->b_evict_lock);
+		return (B_FALSE);
+	} else if (buf->b_data == NULL) {
+		/*
+		 * We have already been added to the arc eviction list;
+		 * recommend eviction.
+		 */
+		ASSERT3P(hdr, ==, &arc_eviction_hdr);
+		mutex_exit(&buf->b_evict_lock);
+		return (B_TRUE);
+	}
+
+	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
+		evict_needed = B_TRUE;
+
+	mutex_exit(&buf->b_evict_lock);
+	return (evict_needed);
+}
+
+/*
  * Evict buffers from list until we've removed the specified number of
  * bytes.  Move the removed buffers to the appropriate evict state.
  * If the recycle flag is set, then attempt to "recycle" a buffer:
@@ -1704,7 +1765,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
 		    (spa && ab->b_spa != spa) ||
 		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
 		    ddi_get_lbolt() - ab->b_arc_access <
-		    arc_min_prefetch_lifespan)) {
+		    zfs_arc_min_prefetch_lifespan)) {
 			skipped++;
 			continue;
 		}
@@ -1715,7 +1776,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
 		hash_lock = HDR_LOCK(ab);
 		have_lock = MUTEX_HELD(hash_lock);
 		if (have_lock || mutex_tryenter(hash_lock)) {
-			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+			ASSERT0(refcount_count(&ab->b_refcnt));
 			ASSERT(ab->b_datacnt > 0);
 			while (ab->b_buf) {
 				arc_buf_t *buf = ab->b_buf;
@@ -1804,12 +1865,14 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
 		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
 			int64_t todelete =
 			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
-			arc_evict_ghost(arc_mru_ghost, 0, todelete);
+			arc_evict_ghost(arc_mru_ghost, 0, todelete,
+			    ARC_BUFC_DATA);
 		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
 			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
 			    arc_mru_ghost->arcs_size +
 			    arc_mfu_ghost->arcs_size - arc_c);
-			arc_evict_ghost(arc_mfu_ghost, 0, todelete);
+			arc_evict_ghost(arc_mfu_ghost, 0, todelete,
+			    ARC_BUFC_DATA);
 		}
 	}
 
@@ -1821,11 +1884,12 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
  * bytes.  Destroy the buffers that are removed.
  */
 static void
-arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type)
 {
 	arc_buf_hdr_t *ab, *ab_prev;
 	arc_buf_hdr_t marker;
-	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
+	list_t *list = &state->arcs_list[type];
 	kmutex_t *hash_lock;
 	uint64_t bytes_deleted = 0;
 	uint64_t bufs_skipped = 0;
@@ -1955,7 +2019,7 @@ arc_adjust(void)
 
 	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
 		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
-		arc_evict_ghost(arc_mru_ghost, 0, delta);
+		arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_DATA);
 	}
 
 	adjustment =
@@ -1963,7 +2027,7 @@ arc_adjust(void)
 
 	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
 		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
-		arc_evict_ghost(arc_mfu_ghost, 0, delta);
+		arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_DATA);
 	}
 }
 
@@ -2055,7 +2119,7 @@ arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
 	}
 
 	if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
-		arc_do_user_prune(arc_meta_prune);
+		arc_do_user_prune(zfs_arc_meta_prune);
 }
 
 /*
@@ -2068,7 +2132,7 @@ arc_flush(spa_t *spa)
 	uint64_t guid = 0;
 
 	if (spa)
-		guid = spa_guid(spa);
+		guid = spa_load_guid(spa);
 
 	while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
 		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
@@ -2091,8 +2155,8 @@ arc_flush(spa_t *spa)
 			break;
 	}
 
-	arc_evict_ghost(arc_mru_ghost, guid, -1);
-	arc_evict_ghost(arc_mfu_ghost, guid, -1);
+	arc_evict_ghost(arc_mru_ghost, guid, -1, ARC_BUFC_DATA);
+	arc_evict_ghost(arc_mfu_ghost, guid, -1, ARC_BUFC_DATA);
 
 	mutex_enter(&arc_reclaim_thr_lock);
 	arc_do_user_evicts();
@@ -2101,22 +2165,19 @@ arc_flush(spa_t *spa)
 }
 
 void
-arc_shrink(void)
+arc_shrink(uint64_t bytes)
 {
 	if (arc_c > arc_c_min) {
 		uint64_t to_free;
 
-#ifdef _KERNEL
-		to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
-#else
-		to_free = arc_c >> arc_shrink_shift;
-#endif
+		to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift;
+
 		if (arc_c > arc_c_min + to_free)
 			atomic_add_64(&arc_c, -to_free);
 		else
 			arc_c = arc_c_min;
 
-		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
+		atomic_add_64(&arc_p, -(arc_p >> zfs_arc_shrink_shift));
 		if (arc_c > arc_size)
 			arc_c = MAX(arc_size, arc_c_min);
 		if (arc_p > arc_c)
@@ -2129,66 +2190,8 @@ arc_shrink(void)
 		arc_adjust();
 }
 
-static int
-arc_reclaim_needed(void)
-{
-#ifdef _KERNEL
-	uint64_t extra;
-
-	if (needfree)
-		return (1);
-
-	/*
-	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
-	 */
-	extra = desfree;
-
-	/*
-	 * check that we're out of range of the pageout scanner.  It starts to
-	 * schedule paging if freemem is less than lotsfree and needfree.
-	 * lotsfree is the high-water mark for pageout, and needfree is the
-	 * number of needed free pages.  We add extra pages here to make sure
-	 * the scanner doesn't start up while we're freeing memory.
-	 */
-	if (freemem < lotsfree + needfree + extra)
-		return (1);
-
-	/*
-	 * check to make sure that swapfs has enough space so that anon
-	 * reservations can still succeed. anon_resvmem() checks that the
-	 * availrmem is greater than swapfs_minfree, and the number of reserved
-	 * swap pages.  We also add a bit of extra here just to prevent
-	 * circumstances from getting really dire.
-	 */
-	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
-		return (1);
-
-#if defined(__i386)
-	/*
-	 * If we're on an i386 platform, it's possible that we'll exhaust the
-	 * kernel heap space before we ever run out of available physical
-	 * memory.  Most checks of the size of the heap_area compare against
-	 * tune.t_minarmem, which is the minimum available real memory that we
-	 * can have in the system.  However, this is generally fixed at 25 pages
-	 * which is so low that it's useless.  In this comparison, we seek to
-	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
-	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
-	 * free)
-	 */
-	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
-	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
-		return (1);
-#endif
-
-#else
-	if (spa_get_random(100) == 0)
-		return (1);
-#endif
-	return (0);
-}
-
 static void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
 {
 	size_t			i;
 	kmem_cache_t		*prev_cache = NULL;
@@ -2201,7 +2204,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 	 * reap free buffers from the arc kmem caches.
 	 */
 	if (strat == ARC_RECLAIM_AGGR)
-		arc_shrink();
+		arc_shrink(bytes);
 
 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
 		if (zio_buf_cache[i] != prev_cache) {
@@ -2218,11 +2221,16 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 	kmem_cache_reap_now(hdr_cache);
 }
 
+/*
+ * Unlike other ZFS implementations this thread is only responsible for
+ * adapting the target ARC size on Linux.  The responsibility for memory
+ * reclamation has been entirely delegated to the arc_shrinker_func()
+ * which is registered with the VM.  To reflect this change in behavior
+ * the arc_reclaim thread has been renamed to arc_adapt.
+ */
 static void
-arc_reclaim_thread(void)
+arc_adapt_thread(void)
 {
-	clock_t			growtime = 0;
-	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
 	callb_cpr_t		cpr;
 	int64_t			prune;
 
@@ -2230,7 +2238,10 @@ arc_reclaim_thread(void)
 
 	mutex_enter(&arc_reclaim_thr_lock);
 	while (arc_thread_exit == 0) {
-		if (arc_reclaim_needed()) {
+#ifndef _KERNEL
+		arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
+
+		if (spa_get_random(100) == 0) {
 
 			if (arc_no_grow) {
 				if (last_reclaim == ARC_RECLAIM_CONS) {
@@ -2245,14 +2256,16 @@ arc_reclaim_thread(void)
 			}
 
 			/* reset the growth delay for every reclaim */
-			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
+			arc_grow_time = ddi_get_lbolt()+(zfs_arc_grow_retry * hz);
 
-			arc_kmem_reap_now(last_reclaim);
+			arc_kmem_reap_now(last_reclaim, 0);
 			arc_warm = B_TRUE;
+		}
+#endif /* !_KERNEL */
 
-		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
+		/* No recent memory pressure allow the ARC to grow. */
+		if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
 			arc_no_grow = FALSE;
-		}
 
 		/*
 		 * Keep meta data usage within limits, arc_shrink() is not
@@ -2273,6 +2286,26 @@ arc_reclaim_thread(void)
 		(void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
 		    &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+
+
+		/* Allow the module options to be changed */
+		if (zfs_arc_max > 64 << 20 &&
+		    zfs_arc_max < physmem * PAGESIZE &&
+		    zfs_arc_max != arc_c_max)
+			arc_c_max = zfs_arc_max;
+
+		if (zfs_arc_min > 0 &&
+		    zfs_arc_min < arc_c_max &&
+		    zfs_arc_min != arc_c_min)
+			arc_c_min = zfs_arc_min;
+
+		if (zfs_arc_meta_limit > 0 &&
+		    zfs_arc_meta_limit <= arc_c_max &&
+		    zfs_arc_meta_limit != arc_meta_limit)
+			arc_meta_limit = zfs_arc_meta_limit;
+
+
+
 	}
 
 	arc_thread_exit = 0;
@@ -2283,28 +2316,83 @@ arc_reclaim_thread(void)
 
 #ifdef _KERNEL
 /*
- * Under Linux the arc shrinker may be called for synchronous (direct)
- * reclaim, or asynchronous (indirect) reclaim.  When called by kswapd
- * for indirect reclaim we take a conservative approach and just reap
- * free slabs from the ARC caches.  If this proves to be insufficient
- * direct reclaim will be trigger.  In direct reclaim a more aggressive
- * strategy is used, data is evicted from the ARC and free slabs reaped.
+ * Determine the amount of memory eligible for eviction contained in the
+ * ARC. All clean data reported by the ghost lists can always be safely
+ * evicted. Due to arc_c_min, the same does not hold for all clean data
+ * contained by the regular mru and mfu lists.
+ *
+ * In the case of the regular mru and mfu lists, we need to report as
+ * much clean data as possible, such that evicting that same reported
+ * data will not bring arc_size below arc_c_min. Thus, in certain
+ * circumstances, the total amount of clean data in the mru and mfu
+ * lists might not actually be evictable.
+ *
+ * The following two distinct cases are accounted for:
+ *
+ * 1. The sum of the amount of dirty data contained by both the mru and
+ *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ *    is greater than or equal to arc_c_min.
+ *    (i.e. amount of dirty data >= arc_c_min)
+ *
+ *    This is the easy case; all clean data contained by the mru and mfu
+ *    lists is evictable. Evicting all clean data can only drop arc_size
+ *    to the amount of dirty data, which is greater than arc_c_min.
+ *
+ * 2. The sum of the amount of dirty data contained by both the mru and
+ *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ *    is less than arc_c_min.
+ *    (i.e. arc_c_min > amount of dirty data)
+ *
+ *    2.1. arc_size is greater than or equal arc_c_min.
+ *         (i.e. arc_size >= arc_c_min > amount of dirty data)
+ *
+ *         In this case, not all clean data from the regular mru and mfu
+ *         lists is actually evictable; we must leave enough clean data
+ *         to keep arc_size above arc_c_min. Thus, the maximum amount of
+ *         evictable data from the two lists combined, is exactly the
+ *         difference between arc_size and arc_c_min.
+ *
+ *    2.2. arc_size is less than arc_c_min
+ *         (i.e. arc_c_min > arc_size > amount of dirty data)
+ *
+ *         In this case, none of the data contained in the mru and mfu
+ *         lists is evictable, even if it's clean. Since arc_size is
+ *         already below arc_c_min, evicting any more would only
+ *         increase this negative difference.
  */
+static uint64_t
+arc_evictable_memory(void) {
+	uint64_t arc_clean =
+	    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
+	    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+	    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
+	    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+	uint64_t ghost_clean =
+	    arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
+	    arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
+	    arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
+	    arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
+	uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
+
+	if (arc_dirty >= arc_c_min)
+		return (ghost_clean + arc_clean);
+
+	return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
+}
+
 static int
 __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
 {
-	arc_reclaim_strategy_t strategy;
-	int arc_reclaim;
+	uint64_t pages;
 
-	/* Return number of reclaimable pages based on arc_shrink_shift */
-	arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
-	    >> arc_shrink_shift, 0);
-	if (sc->nr_to_scan == 0)
-		return (arc_reclaim);
+	/* The arc is considered warm once reclaim has occurred */
+	if (unlikely(arc_warm == B_FALSE))
+		arc_warm = B_TRUE;
 
-	/* Prevent reclaim below arc_c_min */
-	if (arc_reclaim <= 0)
-		return (-1);
+	/* Return the potential number of reclaimable pages */
+	pages = btop(arc_evictable_memory());
+	if (sc->nr_to_scan == 0)
+		return (pages);
 
 	/* Not allowed to perform filesystem reclaim */
 	if (!(sc->gfp_mask & __GFP_FS))
@@ -2314,20 +2402,35 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
 	if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
 		return (-1);
 
+	/*
+	 * Evict the requested number of pages by shrinking arc_c the
+	 * requested amount.  If there is nothing left to evict just
+	 * reap whatever we can from the various arc slabs.
+	 */
+	if (pages > 0) {
+		arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
+	} else {
+		arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
+	}
+
+	/*
+	 * When direct reclaim is observed it usually indicates a rapid
+	 * increase in memory pressure.  This occurs because the kswapd
+	 * threads were unable to asynchronously keep enough free memory
+	 * available.  In this case set arc_no_grow to briefly pause arc
+	 * growth to avoid compounding the memory pressure.
+	 */
 	if (current_is_kswapd()) {
-		strategy = ARC_RECLAIM_CONS;
-		ARCSTAT_INCR(arcstat_memory_indirect_count, 1);
+		ARCSTAT_BUMP(arcstat_memory_indirect_count);
 	} else {
-		strategy = ARC_RECLAIM_AGGR;
-		ARCSTAT_INCR(arcstat_memory_direct_count, 1);
+		arc_no_grow = B_TRUE;
+		arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
+		ARCSTAT_BUMP(arcstat_memory_direct_count);
 	}
 
-	arc_kmem_reap_now(strategy);
-	arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
-	    >> arc_shrink_shift, 0);
 	mutex_exit(&arc_reclaim_thr_lock);
 
-	return (arc_reclaim);
+	return (-1);
 }
 SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
 
@@ -2343,7 +2446,7 @@ static void
 arc_adapt(int bytes, arc_state_t *state)
 {
 	int mult;
-	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+	uint64_t arc_p_min = (arc_c >> zfs_arc_p_min_shift);
 
 	if (state == arc_l2c_only)
 		return;
@@ -2375,11 +2478,6 @@ arc_adapt(int bytes, arc_state_t *state)
 	}
 	ASSERT((int64_t)arc_p >= 0);
 
-	if (arc_reclaim_needed()) {
-		cv_signal(&arc_reclaim_thr_cv);
-		return;
-	}
-
 	if (arc_no_grow)
 		return;
 
@@ -2412,19 +2510,7 @@ arc_evict_needed(arc_buf_contents_t type)
 	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
 		return (1);
 
-#ifdef _KERNEL
-	/*
-	 * If zio data pages are being allocated out of a separate heap segment,
-	 * then enforce that the size of available vmem for this area remains
-	 * above about 1/32nd free.
-	 */
-	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
-	    vmem_size(zio_arena, VMEM_FREE) <
-	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
-		return (1);
-#endif
-
-	if (arc_reclaim_needed())
+	if (arc_no_grow)
 		return (1);
 
 	return (arc_size > arc_c);
@@ -2657,7 +2743,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 			 * This is a prefetch access...
 			 * move this block back to the MRU state.
 			 */
-			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
+			ASSERT0(refcount_count(&buf->b_refcnt));
 			new_state = arc_mru;
 		}
 
@@ -2739,10 +2825,12 @@ arc_read_done(zio_t *zio)
 	callback_list = hdr->b_acb;
 	ASSERT(callback_list != NULL);
 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
-		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
-		    byteswap_uint64_array :
-		    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
-		func(buf->b_data, hdr->b_size);
+		dmu_object_byteswap_t bswap =
+		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
+		if (BP_GET_LEVEL(zio->io_bp) > 0)
+		    byteswap_uint64_array(buf->b_data, hdr->b_size);
+		else
+		    dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
 	}
 
 	arc_cksum_compute(buf, B_FALSE);
@@ -2761,8 +2849,10 @@ arc_read_done(zio_t *zio)
 	abuf = buf;
 	for (acb = callback_list; acb; acb = acb->acb_next) {
 		if (acb->acb_done) {
-			if (abuf == NULL)
+			if (abuf == NULL) {
+				ARCSTAT_BUMP(arcstat_duplicate_reads);
 				abuf = arc_buf_clone(buf);
+			}
 			acb->acb_buf = abuf;
 			abuf = NULL;
 		}
@@ -2826,7 +2916,7 @@ arc_read_done(zio_t *zio)
 }
 
 /*
- * "Read" the block block at the specified DVA (in bp) via the
+ * "Read" the block at the specified DVA (in bp) via the
  * cache.  If the block is found in the cache, invoke the provided
  * callback immediately and return.  Note that the `zio' parameter
  * in the callback will be NULL in this case, since no IO was
@@ -2842,48 +2932,17 @@ arc_read_done(zio_t *zio)
  *
  * arc_read_done() will invoke all the requested "done" functions
  * for readers of this block.
- *
- * Normal callers should use arc_read and pass the arc buffer and offset
- * for the bp.  But if you know you don't need locking, you can use
- * arc_read_bp.
  */
 int
-arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
-    arc_done_func_t *done, void *private, int priority, int zio_flags,
-    uint32_t *arc_flags, const zbookmark_t *zb)
-{
-	int err;
-
-	if (pbuf == NULL) {
-		/*
-		 * XXX This happens from traverse callback funcs, for
-		 * the objset_phys_t block.
-		 */
-		return (arc_read_nolock(pio, spa, bp, done, private, priority,
-		    zio_flags, arc_flags, zb));
-	}
-
-	ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
-	ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
-	rw_enter(&pbuf->b_data_lock, RW_READER);
-
-	err = arc_read_nolock(pio, spa, bp, done, private, priority,
-	    zio_flags, arc_flags, zb);
-	rw_exit(&pbuf->b_data_lock);
-
-	return (err);
-}
-
-int
-arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
-    arc_done_func_t *done, void *private, int priority, int zio_flags,
-    uint32_t *arc_flags, const zbookmark_t *zb)
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
+    void *private, int priority, int zio_flags, uint32_t *arc_flags,
+    const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf = NULL;
 	kmutex_t *hash_lock;
 	zio_t *rzio;
-	uint64_t guid = spa_guid(spa);
+	uint64_t guid = spa_load_guid(spa);
 
 top:
 	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
@@ -2950,6 +3009,8 @@ top:
 		arc_access(hdr, hash_lock);
 		if (*arc_flags & ARC_L2CACHE)
 			hdr->b_flags |= ARC_L2CACHE;
+		if (*arc_flags & ARC_L2COMPRESS)
+			hdr->b_flags |= ARC_L2COMPRESS;
 		mutex_exit(hash_lock);
 		ARCSTAT_BUMP(arcstat_hits);
 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
@@ -2990,13 +3051,15 @@ top:
 			}
 			if (*arc_flags & ARC_L2CACHE)
 				hdr->b_flags |= ARC_L2CACHE;
+			if (*arc_flags & ARC_L2COMPRESS)
+				hdr->b_flags |= ARC_L2COMPRESS;
 			if (BP_GET_LEVEL(bp) > 0)
 				hdr->b_flags |= ARC_INDIRECT;
 		} else {
 			/* this block is in the ghost cache */
 			ASSERT(GHOST_STATE(hdr->b_state));
 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
+			ASSERT0(refcount_count(&hdr->b_refcnt));
 			ASSERT(hdr->b_buf == NULL);
 
 			/* if this is a prefetch, we don't have a reference */
@@ -3006,6 +3069,8 @@ top:
 				add_reference(hdr, hash_lock, private);
 			if (*arc_flags & ARC_L2CACHE)
 				hdr->b_flags |= ARC_L2CACHE;
+			if (*arc_flags & ARC_L2COMPRESS)
+				hdr->b_flags |= ARC_L2COMPRESS;
 			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 			buf->b_hdr = hdr;
 			buf->b_data = NULL;
@@ -3076,20 +3141,36 @@ top:
 				cb->l2rcb_bp = *bp;
 				cb->l2rcb_zb = *zb;
 				cb->l2rcb_flags = zio_flags;
+				cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
 
 				/*
 				 * l2arc read.  The SCL_L2ARC lock will be
 				 * released by l2arc_read_done().
+				 * Issue a null zio if the underlying buffer
+				 * was squashed to zero size by compression.
 				 */
-				rzio = zio_read_phys(pio, vd, addr, size,
-				    buf->b_data, ZIO_CHECKSUM_OFF,
-				    l2arc_read_done, cb, priority, zio_flags |
-				    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
-				    ZIO_FLAG_DONT_PROPAGATE |
-				    ZIO_FLAG_DONT_RETRY, B_FALSE);
+				if (hdr->b_l2hdr->b_compress ==
+				    ZIO_COMPRESS_EMPTY) {
+					rzio = zio_null(pio, spa, vd,
+					    l2arc_read_done, cb,
+					    zio_flags | ZIO_FLAG_DONT_CACHE |
+					    ZIO_FLAG_CANFAIL |
+					    ZIO_FLAG_DONT_PROPAGATE |
+					    ZIO_FLAG_DONT_RETRY);
+				} else {
+					rzio = zio_read_phys(pio, vd, addr,
+					    hdr->b_l2hdr->b_asize,
+					    buf->b_data, ZIO_CHECKSUM_OFF,
+					    l2arc_read_done, cb, priority,
+					    zio_flags | ZIO_FLAG_DONT_CACHE |
+					    ZIO_FLAG_CANFAIL |
+					    ZIO_FLAG_DONT_PROPAGATE |
+					    ZIO_FLAG_DONT_RETRY, B_FALSE);
+				}
 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
 				    zio_t *, rzio);
-				ARCSTAT_INCR(arcstat_l2_read_bytes, size);
+				ARCSTAT_INCR(arcstat_l2_read_bytes,
+				    hdr->b_l2hdr->b_asize);
 
 				if (*arc_flags & ARC_NOWAIT) {
 					zio_nowait(rzio);
@@ -3176,6 +3257,34 @@ arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
 }
 
 /*
+ * Notify the arc that a block was freed, and thus will never be used again.
+ */
+void
+arc_freed(spa_t *spa, const blkptr_t *bp)
+{
+	arc_buf_hdr_t *hdr;
+	kmutex_t *hash_lock;
+	uint64_t guid = spa_load_guid(spa);
+
+	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+	    &hash_lock);
+	if (hdr == NULL)
+		return;
+	if (HDR_BUF_AVAILABLE(hdr)) {
+		arc_buf_t *buf = hdr->b_buf;
+		add_reference(hdr, hash_lock, FTAG);
+		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+		mutex_exit(hash_lock);
+
+		arc_release(buf, FTAG);
+		(void) arc_buf_remove_ref(buf, FTAG);
+	} else {
+		mutex_exit(hash_lock);
+	}
+
+}
+
+/*
  * This is used by the DMU to let the ARC know that a buffer is
  * being evicted, so the ARC should clean up.  If this arc buf
  * is not yet in the evicted state, it will be put there.
@@ -3332,6 +3441,16 @@ arc_release(arc_buf_t *buf, void *tag)
 			ASSERT3U(*size, >=, hdr->b_size);
 			atomic_add_64(size, -hdr->b_size);
 		}
+
+		/*
+		 * We're releasing a duplicate user data buffer, update
+		 * our statistics accordingly.
+		 */
+		if (hdr->b_type == ARC_BUFC_DATA) {
+			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
+			    -hdr->b_size);
+		}
 		hdr->b_datacnt -= 1;
 		arc_cksum_verify(buf);
 
@@ -3370,26 +3489,15 @@ arc_release(arc_buf_t *buf, void *tag)
 	buf->b_private = NULL;
 
 	if (l2hdr) {
+		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
 		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
 		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+		arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
 		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
 		mutex_exit(&l2arc_buflist_mtx);
 	}
 }
 
-/*
- * Release this buffer.  If it does not match the provided BP, fill it
- * with that block's contents.
- */
-/* ARGSUSED */
-int
-arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
-    zbookmark_t *zb)
-{
-	arc_release(buf, tag);
-	return (0);
-}
-
 int
 arc_released(arc_buf_t *buf)
 {
@@ -3526,9 +3634,9 @@ arc_write_done(zio_t *zio)
 
 zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
-    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
-    arc_done_func_t *ready, arc_done_func_t *done, void *private,
-    int priority, int zio_flags, const zbookmark_t *zb)
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
+    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
+    void *private, int priority, int zio_flags, const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
@@ -3541,7 +3649,9 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
 	ASSERT(hdr->b_acb == NULL);
 	if (l2arc)
 		hdr->b_flags |= ARC_L2CACHE;
-	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+	if (l2arc_compress)
+		hdr->b_flags |= ARC_L2COMPRESS;
+	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE);
 	callback->awcb_ready = ready;
 	callback->awcb_done = done;
 	callback->awcb_private = private;
@@ -3557,48 +3667,19 @@ static int
 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
 {
 #ifdef _KERNEL
-	uint64_t available_memory = ptob(freemem);
-	static uint64_t page_load = 0;
-	static uint64_t last_txg = 0;
+	uint64_t available_memory;
 
-#if defined(__i386)
-	available_memory =
-	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
-#endif
-	if (available_memory >= zfs_write_limit_max)
+	if (zfs_arc_memory_throttle_disable)
 		return (0);
 
-	if (txg > last_txg) {
-		last_txg = txg;
-		page_load = 0;
-	}
-	/*
-	 * If we are in pageout, we know that memory is already tight,
-	 * the arc is already going to be evicting, so we just want to
-	 * continue to let page writes occur as quickly as possible.
-	 */
-	if (curproc == proc_pageout) {
-		if (page_load > MAX(ptob(minfree), available_memory) / 4)
-			return (ERESTART);
-		/* Note: reserve is inflated, so we deflate */
-		page_load += reserve / 8;
-		return (0);
-	} else if (page_load > 0 && arc_reclaim_needed()) {
-		/* memory is low, delay before restarting */
+	/* Easily reclaimable memory (free + inactive + arc-evictable) */
+	available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();
+
+	if (available_memory <= zfs_write_limit_max) {
 		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
 		DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
 		return (EAGAIN);
 	}
-	page_load = 0;
-
-	if (arc_size > arc_c_min) {
-		uint64_t evictable_memory =
-		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
-		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
-		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
-		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
-		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
-	}
 
 	if (inflight_data > available_memory / 4) {
 		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
@@ -3725,7 +3806,7 @@ arc_init(void)
 	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Convert seconds to clock ticks */
-	arc_min_prefetch_lifespan = 1 * hz;
+	zfs_arc_min_prefetch_lifespan = 1 * hz;
 
 	/* Start out with 1/8 of all memory */
 	arc_c = physmem * PAGESIZE / 8;
@@ -3747,11 +3828,7 @@ arc_init(void)
 
 	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
 	arc_c_min = MAX(arc_c / 4, 64<<20);
-	/* set max to 1/2 of all memory, or all but 4GB, whichever is more */
-	if (arc_c * 8 >= ((uint64_t)4<<30))
-		arc_c_max = (arc_c * 8) - ((uint64_t)4<<30);
-	else
-		arc_c_max = arc_c_min;
+	/* set max to 1/2 of all memory */
 	arc_c_max = MAX(arc_c * 4, arc_c_max);
 
 	/*
@@ -3777,18 +3854,6 @@ arc_init(void)
 	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
 		arc_c_min = arc_meta_limit / 2;
 
-	if (zfs_arc_grow_retry > 0)
-		arc_grow_retry = zfs_arc_grow_retry;
-
-	if (zfs_arc_shrink_shift > 0)
-		arc_shrink_shift = zfs_arc_shrink_shift;
-
-	if (zfs_arc_p_min_shift > 0)
-		arc_p_min_shift = zfs_arc_p_min_shift;
-
-	if (zfs_arc_meta_prune > 0)
-		arc_meta_prune = zfs_arc_meta_prune;
-
 	/* if kmem_flags are set, lets try to use less memory */
 	if (kmem_debugging())
 		arc_c = arc_c / 2;
@@ -3850,7 +3915,7 @@ arc_init(void)
 		kstat_install(arc_ksp);
 	}
 
-	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+	(void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
 	    TS_RUN, minclsyspri);
 
 	arc_dead = FALSE;
@@ -3980,8 +4045,12 @@ arc_fini(void)
  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
  * It does this by periodically scanning buffers from the eviction-end of
  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
- * not already there.  It scans until a headroom of buffers is satisfied,
- * which itself is a buffer for ARC eviction.  The thread that does this is
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. If a compressible buffer is
+ * found during scanning and selected for writing to an L2ARC device, we
+ * temporarily boost scanning headroom during the next scan cycle to make
+ * sure we adapt to compression effects (which might significantly reduce
+ * the data volume we write to L2ARC). The thread that does this is
  * l2arc_feed_thread(), illustrated below; example sizes are included to
  * provide a better sense of ratio than this diagram:
  *
@@ -4045,7 +4114,13 @@ arc_fini(void)
  *	l2arc_write_max		max write bytes per interval
  *	l2arc_write_boost	extra write bytes during device warmup
  *	l2arc_noprefetch	skip caching prefetched buffers
+ *	l2arc_nocompress	skip compressing buffers
  *	l2arc_headroom		number of max device writes to precache
+ *	l2arc_headroom_boost	when we find compressed buffers during ARC
+ *				scanning, we multiply headroom by this
+ *				percentage factor for the next scan cycle,
+ *				since more compressed buffers are likely to
+ *				be present
  *	l2arc_feed_secs		seconds between L2ARC writing
  *
  * Tunables may be removed or added as future performance improvements are
@@ -4079,14 +4154,24 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
 }
 
 static uint64_t
-l2arc_write_size(l2arc_dev_t *dev)
+l2arc_write_size(void)
 {
 	uint64_t size;
 
-	size = dev->l2ad_write;
+	/*
+	 * Make sure our globals have meaningful values in case the user
+	 * altered them.
+	 */
+	size = l2arc_write_max;
+	if (size == 0) {
+		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
+		    "be greater than zero, resetting it to the default (%d)",
+		    L2ARC_WRITE_SIZE);
+		size = l2arc_write_max = L2ARC_WRITE_SIZE;
+	}
 
 	if (arc_warm == B_FALSE)
-		size += dev->l2ad_boost;
+		size += l2arc_write_boost;
 
 	return (size);
 
@@ -4117,14 +4202,14 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
 static void
 l2arc_hdr_stat_add(void)
 {
-	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
+	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE);
 	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
 }
 
 static void
 l2arc_hdr_stat_remove(void)
 {
-	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
+	ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE);
 	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
 }
 
@@ -4260,14 +4345,23 @@ l2arc_write_done(zio_t *zio)
 			continue;
 		}
 
+		abl2 = ab->b_l2hdr;
+
+		/*
+		 * Release the temporary compressed buffer as soon as possible.
+		 */
+		if (abl2->b_compress != ZIO_COMPRESS_OFF)
+			l2arc_release_cdata_buf(ab);
+
 		if (zio->io_error != 0) {
 			/*
 			 * Error - drop L2ARC entry.
 			 */
 			list_remove(buflist, ab);
-			abl2 = ab->b_l2hdr;
+			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
 			ab->b_l2hdr = NULL;
 			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+			arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
 			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
 		}
 
@@ -4318,6 +4412,13 @@ l2arc_read_done(zio_t *zio)
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 	/*
+	 * If the buffer was compressed, decompress it first.
+	 */
+	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
+		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
+	ASSERT(zio->io_data != NULL);
+
+	/*
 	 * Check this survived the L2ARC journey.
 	 */
 	equal = arc_cksum_equal(buf);
@@ -4512,8 +4613,10 @@ top:
 			 */
 			if (ab->b_l2hdr != NULL) {
 				abl2 = ab->b_l2hdr;
+				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
 				ab->b_l2hdr = NULL;
 				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+				arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
 				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
 			}
 			list_remove(buflist, ab);
@@ -4537,37 +4640,54 @@ top:
  *
  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
  * for reading until they have completed writing.
+ * The headroom_boost is an in-out parameter used to maintain headroom boost
+ * state between calls to this function.
+ *
+ * Returns the number of bytes actually written (which may be smaller than
+ * the delta by which the device hand has changed due to alignment).
  */
 static uint64_t
-l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
+    boolean_t *headroom_boost)
 {
 	arc_buf_hdr_t *ab, *ab_prev, *head;
-	l2arc_buf_hdr_t *hdrl2;
 	list_t *list;
-	uint64_t passed_sz, write_sz, buf_sz, headroom;
+	uint64_t write_asize, write_psize, write_sz, headroom,
+	    buf_compress_minsz;
 	void *buf_data;
-	kmutex_t *hash_lock, *list_lock = NULL;
-	boolean_t have_lock, full;
+	kmutex_t *list_lock = NULL;
+	boolean_t full;
 	l2arc_write_callback_t *cb;
 	zio_t *pio, *wzio;
-	uint64_t guid = spa_guid(spa);
+	uint64_t guid = spa_load_guid(spa);
 	int try;
+	const boolean_t do_headroom_boost = *headroom_boost;
 
 	ASSERT(dev->l2ad_vdev != NULL);
 
+	/* Lower the flag now, we might want to raise it again later. */
+	*headroom_boost = B_FALSE;
+
 	pio = NULL;
-	write_sz = 0;
+	write_sz = write_asize = write_psize = 0;
 	full = B_FALSE;
 	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
 	head->b_flags |= ARC_L2_WRITE_HEAD;
 
 	/*
+	 * We will want to try to compress buffers that are at least 2x the
+	 * device sector size.
+	 */
+	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
+
+	/*
 	 * Copy buffers for L2ARC writing.
 	 */
 	mutex_enter(&l2arc_buflist_mtx);
 	for (try = 0; try <= 3; try++) {
+		uint64_t passed_sz = 0;
+
 		list = l2arc_list_locked(try, &list_lock);
-		passed_sz = 0;
 
 		/*
 		 * L2ARC fast warmup.
@@ -4575,21 +4695,27 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 		 * Until the ARC is warm and starts to evict, read from the
 		 * head of the ARC lists rather than the tail.
 		 */
-		headroom = target_sz * l2arc_headroom;
 		if (arc_warm == B_FALSE)
 			ab = list_head(list);
 		else
 			ab = list_tail(list);
 
+		headroom = target_sz * l2arc_headroom;
+		if (do_headroom_boost)
+			headroom = (headroom * l2arc_headroom_boost) / 100;
+
 		for (; ab; ab = ab_prev) {
+			l2arc_buf_hdr_t *l2hdr;
+			kmutex_t *hash_lock;
+			uint64_t buf_sz;
+
 			if (arc_warm == B_FALSE)
 				ab_prev = list_next(list, ab);
 			else
 				ab_prev = list_prev(list, ab);
 
 			hash_lock = HDR_LOCK(ab);
-			have_lock = MUTEX_HELD(hash_lock);
-			if (!have_lock && !mutex_tryenter(hash_lock)) {
+			if (!mutex_tryenter(hash_lock)) {
 				/*
 				 * Skip this buffer rather than waiting.
 				 */
@@ -4635,16 +4761,29 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			/*
 			 * Create and add a new L2ARC header.
 			 */
-			hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
-			                    KM_PUSHPAGE);
-			hdrl2->b_dev = dev;
-			hdrl2->b_daddr = dev->l2ad_hand;
+			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
+			    KM_PUSHPAGE);
+			l2hdr->b_dev = dev;
+			arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS);
 
 			ab->b_flags |= ARC_L2_WRITING;
-			ab->b_l2hdr = hdrl2;
-			list_insert_head(dev->l2ad_buflist, ab);
-			buf_data = ab->b_buf->b_data;
+
+			/*
+			 * Temporarily stash the data buffer in b_tmp_cdata.
+			 * The subsequent write step will pick it up from
+			 * there. This is because can't access ab->b_buf
+			 * without holding the hash_lock, which we in turn
+			 * can't access without holding the ARC list locks
+			 * (which we want to avoid during compression/writing)
+			 */
+			l2hdr->b_compress = ZIO_COMPRESS_OFF;
+			l2hdr->b_asize = ab->b_size;
+			l2hdr->b_tmp_cdata = ab->b_buf->b_data;
+
 			buf_sz = ab->b_size;
+			ab->b_l2hdr = l2hdr;
+
+			list_insert_head(dev->l2ad_buflist, ab);
 
 			/*
 			 * Compute and store the buffer cksum before
@@ -4655,6 +4794,64 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 
 			mutex_exit(hash_lock);
 
+			write_sz += buf_sz;
+		}
+
+		mutex_exit(list_lock);
+
+		if (full == B_TRUE)
+			break;
+	}
+
+	/* No buffers selected for writing? */
+	if (pio == NULL) {
+		ASSERT0(write_sz);
+		mutex_exit(&l2arc_buflist_mtx);
+		kmem_cache_free(hdr_cache, head);
+		return (0);
+	}
+
+	/*
+	 * Now start writing the buffers. We're starting at the write head
+	 * and work backwards, retracing the course of the buffer selector
+	 * loop above.
+	 */
+	for (ab = list_prev(dev->l2ad_buflist, head); ab;
+	    ab = list_prev(dev->l2ad_buflist, ab)) {
+		l2arc_buf_hdr_t *l2hdr;
+		uint64_t buf_sz;
+
+		/*
+		 * We shouldn't need to lock the buffer here, since we flagged
+		 * it as ARC_L2_WRITING in the previous step, but we must take
+		 * care to only access its L2 cache parameters. In particular,
+		 * ab->b_buf may be invalid by now due to ARC eviction.
+		 */
+		l2hdr = ab->b_l2hdr;
+		l2hdr->b_daddr = dev->l2ad_hand;
+
+		if (!l2arc_nocompress && (ab->b_flags & ARC_L2COMPRESS) &&
+		    l2hdr->b_asize >= buf_compress_minsz) {
+			if (l2arc_compress_buf(l2hdr)) {
+				/*
+				 * If compression succeeded, enable headroom
+				 * boost on the next scan cycle.
+				 */
+				*headroom_boost = B_TRUE;
+			}
+		}
+
+		/*
+		 * Pick up the buffer data we had previously stashed away
+		 * (and now potentially also compressed).
+		 */
+		buf_data = l2hdr->b_tmp_cdata;
+		buf_sz = l2hdr->b_asize;
+
+		/* Compression may have squashed the buffer to zero length. */
+		if (buf_sz != 0) {
+			uint64_t buf_p_sz;
+
 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
 			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
 			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
@@ -4664,33 +4861,24 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			    zio_t *, wzio);
 			(void) zio_nowait(wzio);
 
+			write_asize += buf_sz;
 			/*
 			 * Keep the clock hand suitably device-aligned.
 			 */
-			buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
-
-			write_sz += buf_sz;
-			dev->l2ad_hand += buf_sz;
+			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+			write_psize += buf_p_sz;
+			dev->l2ad_hand += buf_p_sz;
 		}
-
-		mutex_exit(list_lock);
-
-		if (full == B_TRUE)
-			break;
 	}
-	mutex_exit(&l2arc_buflist_mtx);
 
-	if (pio == NULL) {
-		ASSERT3U(write_sz, ==, 0);
-		kmem_cache_free(hdr_cache, head);
-		return (0);
-	}
+	mutex_exit(&l2arc_buflist_mtx);
 
-	ASSERT3U(write_sz, <=, target_sz);
+	ASSERT3U(write_asize, <=, target_sz);
 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
-	ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
+	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
 	ARCSTAT_INCR(arcstat_l2_size, write_sz);
-	vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
+	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
+	vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
 
 	/*
 	 * Bump device hand to the device start if it is approaching the end.
@@ -4708,7 +4896,153 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	(void) zio_wait(pio);
 	dev->l2ad_writing = B_FALSE;
 
-	return (write_sz);
+	return (write_asize);
+}
+
+/*
+ * Compresses an L2ARC buffer.
+ * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
+ * size in l2hdr->b_asize. This routine tries to compress the data and
+ * depending on the compression result there are three possible outcomes:
+ * *) The buffer was incompressible. The original l2hdr contents were left
+ *    untouched and are ready for writing to an L2 device.
+ * *) The buffer was all-zeros, so there is no need to write it to an L2
+ *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
+ *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
+ * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
+ *    data buffer which holds the compressed data to be written, and b_asize
+ *    tells us how much data there is. b_compress is set to the appropriate
+ *    compression algorithm. Once writing is done, invoke
+ *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
+ *
+ * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
+ * buffer was incompressible).
+ */
+static boolean_t
+l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
+{
+	void *cdata;
+	size_t csize, len;
+
+	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
+	ASSERT(l2hdr->b_tmp_cdata != NULL);
+
+	len = l2hdr->b_asize;
+	cdata = zio_data_buf_alloc(len);
+	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
+	    cdata, l2hdr->b_asize);
+
+	if (csize == 0) {
+		/* zero block, indicate that there's nothing to write */
+		zio_data_buf_free(cdata, len);
+		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
+		l2hdr->b_asize = 0;
+		l2hdr->b_tmp_cdata = NULL;
+		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
+		return (B_TRUE);
+	} else if (csize > 0 && csize < len) {
+		/*
+		 * Compression succeeded, we'll keep the cdata around for
+		 * writing and release it afterwards.
+		 */
+		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
+		l2hdr->b_asize = csize;
+		l2hdr->b_tmp_cdata = cdata;
+		ARCSTAT_BUMP(arcstat_l2_compress_successes);
+		return (B_TRUE);
+	} else {
+		/*
+		 * Compression failed, release the compressed buffer.
+		 * l2hdr will be left unmodified.
+		 */
+		zio_data_buf_free(cdata, len);
+		ARCSTAT_BUMP(arcstat_l2_compress_failures);
+		return (B_FALSE);
+	}
+}
+
+/*
+ * Decompresses a zio read back from an l2arc device. On success, the
+ * underlying zio's io_data buffer is overwritten by the uncompressed
+ * version. On decompression error (corrupt compressed stream), the
+ * zio->io_error value is set to signal an I/O error.
+ *
+ * Please note that the compressed data stream is not checksummed, so
+ * if the underlying device is experiencing data corruption, we may feed
+ * corrupt data to the decompressor, so the decompressor needs to be
+ * able to handle this situation (LZ4 does).
+ */
+static void
+l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
+{
+	uint64_t csize;
+	void *cdata;
+
+	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
+
+	if (zio->io_error != 0) {
+		/*
+		 * An io error has occured, just restore the original io
+		 * size in preparation for a main pool read.
+		 */
+		zio->io_orig_size = zio->io_size = hdr->b_size;
+		return;
+	}
+
+	if (c == ZIO_COMPRESS_EMPTY) {
+		/*
+		 * An empty buffer results in a null zio, which means we
+		 * need to fill its io_data after we're done restoring the
+		 * buffer's contents.
+		 */
+		ASSERT(hdr->b_buf != NULL);
+		bzero(hdr->b_buf->b_data, hdr->b_size);
+		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
+	} else {
+		ASSERT(zio->io_data != NULL);
+		/*
+		 * We copy the compressed data from the start of the arc buffer
+		 * (the zio_read will have pulled in only what we need, the
+		 * rest is garbage which we will overwrite at decompression)
+		 * and then decompress back to the ARC data buffer. This way we
+		 * can minimize copying by simply decompressing back over the
+		 * original compressed data (rather than decompressing to an
+		 * aux buffer and then copying back the uncompressed buffer,
+		 * which is likely to be much larger).
+		 */
+		csize = zio->io_size;
+		cdata = zio_data_buf_alloc(csize);
+		bcopy(zio->io_data, cdata, csize);
+		if (zio_decompress_data(c, cdata, zio->io_data, csize,
+		    hdr->b_size) != 0)
+			zio->io_error = EIO;
+		zio_data_buf_free(cdata, csize);
+	}
+
+	/* Restore the expected uncompressed IO size. */
+	zio->io_orig_size = zio->io_size = hdr->b_size;
+}
+
+/*
+ * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
+ * This buffer serves as a temporary holder of compressed data while
+ * the buffer entry is being written to an l2arc device. Once that is
+ * done, we can dispose of it.
+ */
+static void
+l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
+{
+	l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
+
+	if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
+		/*
+		 * If the data was compressed, then we've allocated a
+		 * temporary buffer for it, so now we need to release it.
+		 */
+		ASSERT(l2hdr->b_tmp_cdata != NULL);
+		zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
+	}
+	l2hdr->b_tmp_cdata = NULL;
 }
 
 /*
@@ -4723,6 +5057,7 @@ l2arc_feed_thread(void)
 	spa_t *spa;
 	uint64_t size, wrote;
 	clock_t begin, next = ddi_get_lbolt();
+	boolean_t headroom_boost = B_FALSE;
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
@@ -4775,7 +5110,7 @@ l2arc_feed_thread(void)
 		/*
 		 * Avoid contributing to memory pressure.
 		 */
-		if (arc_reclaim_needed()) {
+		if (arc_no_grow) {
 			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
 			spa_config_exit(spa, SCL_L2ARC, dev);
 			continue;
@@ -4783,7 +5118,7 @@ l2arc_feed_thread(void)
 
 		ARCSTAT_BUMP(arcstat_l2_feeds);
 
-		size = l2arc_write_size(dev);
+		size = l2arc_write_size();
 
 		/*
 		 * Evict L2ARC buffers that will be overwritten.
@@ -4793,7 +5128,7 @@ l2arc_feed_thread(void)
 		/*
 		 * Write ARC buffers.
 		 */
-		wrote = l2arc_write_buffers(spa, dev, size);
+		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
 
 		/*
 		 * Calculate interval between writes.
@@ -4841,8 +5176,6 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
 	adddev->l2ad_spa = spa;
 	adddev->l2ad_vdev = vd;
-	adddev->l2ad_write = l2arc_write_max;
-	adddev->l2ad_boost = l2arc_write_boost;
 	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
 	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
 	adddev->l2ad_hand = adddev->l2ad_start;
@@ -4850,7 +5183,6 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 	adddev->l2ad_first = B_TRUE;
 	adddev->l2ad_writing = B_FALSE;
 	list_link_init(&adddev->l2ad_node);
-	ASSERT3U(adddev->l2ad_write, >, 0);
 
 	/*
 	 * This is a list of all ARC buffers that are still valid on the
@@ -4983,49 +5315,64 @@ EXPORT_SYMBOL(arc_getbuf_func);
 EXPORT_SYMBOL(arc_add_prune_callback);
 EXPORT_SYMBOL(arc_remove_prune_callback);
 
-module_param(zfs_arc_min, ulong, 0444);
+module_param(zfs_arc_min, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
 
-module_param(zfs_arc_max, ulong, 0444);
+module_param(zfs_arc_max, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
 
-module_param(zfs_arc_meta_limit, ulong, 0444);
+module_param(zfs_arc_meta_limit, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
 
-module_param(zfs_arc_meta_prune, int, 0444);
+module_param(zfs_arc_meta_prune, int, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
 
-module_param(zfs_arc_grow_retry, int, 0444);
+module_param(zfs_arc_grow_retry, int, 0644);
 MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
 
-module_param(zfs_arc_shrink_shift, int, 0444);
+module_param(zfs_arc_shrink_shift, int, 0644);
 MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
 
-module_param(zfs_arc_p_min_shift, int, 0444);
+module_param(zfs_arc_p_min_shift, int, 0644);
 MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
 
-module_param(l2arc_write_max, ulong, 0444);
+module_param(zfs_disable_dup_eviction, int, 0644);
+MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
+
+module_param(zfs_arc_memory_throttle_disable, int, 0644);
+MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");
+
+module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
+
+module_param(l2arc_write_max, ulong, 0644);
 MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
 
-module_param(l2arc_write_boost, ulong, 0444);
+module_param(l2arc_write_boost, ulong, 0644);
 MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
 
-module_param(l2arc_headroom, ulong, 0444);
+module_param(l2arc_headroom, ulong, 0644);
 MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
 
-module_param(l2arc_feed_secs, ulong, 0444);
+module_param(l2arc_headroom_boost, ulong, 0644);
+MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");
+
+module_param(l2arc_feed_secs, ulong, 0644);
 MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
 
-module_param(l2arc_feed_min_ms, ulong, 0444);
+module_param(l2arc_feed_min_ms, ulong, 0644);
 MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
 
-module_param(l2arc_noprefetch, int, 0444);
+module_param(l2arc_noprefetch, int, 0644);
 MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
 
-module_param(l2arc_feed_again, int, 0444);
+module_param(l2arc_nocompress, int, 0644);
+MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");
+
+module_param(l2arc_feed_again, int, 0644);
 MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
 
-module_param(l2arc_norw, int, 0444);
+module_param(l2arc_norw, int, 0644);
 MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
 
 #endif