X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Farc.c;h=f71c24844e8b3821d220de97ab90c0932476f7e5;hb=23bdb07d4e4c435205d25d3efdb5fef2d089ce5e;hp=d5e5aa544bfd09112f53a37e694208a1b9eb70cd;hpb=9babb37438b58e77bad04e820d5702e15b79e6a6;p=zfs.git

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index d5e5aa5..f71c248 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -119,7 +118,6 @@
 
 #include <sys/spa.h>
 #include <sys/zio.h>
-#include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/refcount.h>
@@ -133,6 +131,7 @@
 #endif
 #include <sys/callb.h>
 #include <sys/kstat.h>
+#include <zfs_fletcher.h>
 
 static kmutex_t		arc_reclaim_thr_lock;
 static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
@@ -175,13 +174,13 @@ static boolean_t arc_warm;
 /*
  * These tunables are for performance analysis.
  */
-uint64_t zfs_arc_max;
-uint64_t zfs_arc_min;
-uint64_t zfs_arc_meta_limit = 0;
-int zfs_mdcomp_disable = 0;
+unsigned long zfs_arc_max = 0;
+unsigned long zfs_arc_min = 0;
+unsigned long zfs_arc_meta_limit = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
+int zfs_arc_reduce_dnlc_percent = 0;
 
 /*
  * Note that buffers can be in one of 6 states:
@@ -249,6 +248,9 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_recycle_miss;
 	kstat_named_t arcstat_mutex_miss;
 	kstat_named_t arcstat_evict_skip;
+	kstat_named_t arcstat_evict_l2_cached;
+	kstat_named_t arcstat_evict_l2_eligible;
+	kstat_named_t arcstat_evict_l2_ineligible;
 	kstat_named_t arcstat_hash_elements;
 	kstat_named_t arcstat_hash_elements_max;
 	kstat_named_t arcstat_hash_collisions;
@@ -281,6 +283,14 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_l2_size;
 	kstat_named_t arcstat_l2_hdr_size;
 	kstat_named_t arcstat_memory_throttle_count;
+	kstat_named_t arcstat_memory_direct_count;
+	kstat_named_t arcstat_memory_indirect_count;
+	kstat_named_t arcstat_no_grow;
+	kstat_named_t arcstat_tempreserve;
+	kstat_named_t arcstat_loaned_bytes;
+	kstat_named_t arcstat_meta_used;
+	kstat_named_t arcstat_meta_limit;
+	kstat_named_t arcstat_meta_max;
 } arc_stats_t;
 
 static arc_stats_t arc_stats = {
@@ -302,6 +312,9 @@ static arc_stats_t arc_stats = {
 	{ "recycle_miss",		KSTAT_DATA_UINT64 },
 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
 	{ "evict_skip",			KSTAT_DATA_UINT64 },
+	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
+	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
+	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
 	{ "hash_elements",		KSTAT_DATA_UINT64 },
 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
@@ -333,7 +346,15 @@ static arc_stats_t arc_stats = {
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
 	{ "l2_size",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
-	{ "memory_throttle_count",	KSTAT_DATA_UINT64 }
+	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
+	{ "memory_direct_count",	KSTAT_DATA_UINT64 },
+	{ "memory_indirect_count",	KSTAT_DATA_UINT64 },
+	{ "arc_no_grow",		KSTAT_DATA_UINT64 },
+	{ "arc_tempreserve",		KSTAT_DATA_UINT64 },
+	{ "arc_loaned_bytes",		KSTAT_DATA_UINT64 },
+	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
+	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
+	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
 };
 
 #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
@@ -341,7 +362,7 @@ static arc_stats_t arc_stats = {
 #define	ARCSTAT_INCR(stat, val) \
 	atomic_add_64(&arc_stats.stat.value.ui64, (val));
 
-#define	ARCSTAT_BUMP(stat) 	ARCSTAT_INCR(stat, 1)
+#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
 #define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
 
 #define	ARCSTAT_MAX(stat, val) {					\
@@ -375,7 +396,7 @@ static arc_stats_t arc_stats = {
 	}
 
 kstat_t			*arc_ksp;
-static arc_state_t 	*arc_anon;
+static arc_state_t	*arc_anon;
 static arc_state_t	*arc_mru;
 static arc_state_t	*arc_mru_ghost;
 static arc_state_t	*arc_mfu;
@@ -395,13 +416,12 @@ static arc_state_t	*arc_l2c_only;
 #define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
 #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
-
-static int		arc_no_grow;	/* Don't try to grow cache size */
-static uint64_t		arc_tempreserve;
-static uint64_t		arc_loaned_bytes;
-static uint64_t		arc_meta_used;
-static uint64_t		arc_meta_limit;
-static uint64_t		arc_meta_max = 0;
+#define	arc_no_grow	ARCSTAT(arcstat_no_grow)
+#define	arc_tempreserve	ARCSTAT(arcstat_tempreserve)
+#define	arc_loaned_bytes	ARCSTAT(arcstat_loaned_bytes)
+#define	arc_meta_used	ARCSTAT(arcstat_meta_used)
+#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit)
+#define	arc_meta_max	ARCSTAT(arcstat_meta_max)
 
 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 
@@ -432,6 +452,7 @@ struct arc_buf_hdr {
 
 	kmutex_t		b_freeze_lock;
 	zio_cksum_t		*b_freeze_cksum;
+	void			*b_thawed;
 
 	arc_buf_hdr_t		*b_hash_next;
 	arc_buf_t		*b_buf;
@@ -468,6 +489,8 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 static int arc_evict_needed(arc_buf_contents_t type);
 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 
+static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
+
 #define	GHOST_STATE(state)	\
 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
 	(state) == arc_l2c_only)
@@ -490,7 +513,6 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 #define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
 #define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
 #define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
-#define	ARC_STORED		(1 << 19)	/* has been store()d to */
 
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -517,12 +539,13 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
  * Hash table routines
  */
 
-#define	HT_LOCK_PAD	64
+#define	HT_LOCK_ALIGN	64
+#define	HT_LOCK_PAD	(P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
 
 struct ht_lock {
 	kmutex_t	ht_lock;
 #ifdef _KERNEL
-	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+	unsigned char	pad[HT_LOCK_PAD];
 #endif
 };
 
@@ -539,8 +562,8 @@ static buf_hash_table_t buf_hash_table;
 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-#define	HDR_LOCK(buf) \
-	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+#define	HDR_LOCK(hdr) \
+	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 
 uint64_t zfs_crc64_table[256];
 
@@ -559,14 +582,14 @@ uint64_t zfs_crc64_table[256];
 /*
  * L2ARC Performance Tunables
  */
-uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
-uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
-uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
-uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
-uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
-boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
-boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
-boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
+unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;	/* def max write size */
+unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra warmup write */
+unsigned long l2arc_headroom = L2ARC_HEADROOM;		/* # of dev writes */
+unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
+unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval msecs */
+int l2arc_noprefetch = B_TRUE;			/* don't cache prefetch bufs */
+int l2arc_feed_again = B_TRUE;			/* turbo warmup */
+int l2arc_norw = B_TRUE;			/* no reads during writes */
 
 /*
  * L2ARC Internals
@@ -658,6 +681,15 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
 	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+	hdr->b_dva.dva_word[0] = 0;
+	hdr->b_dva.dva_word[1] = 0;
+	hdr->b_birth = 0;
+	hdr->b_cksum0 = 0;
+}
+
 static arc_buf_hdr_t *
 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 {
@@ -757,8 +789,15 @@ buf_fini(void)
 {
 	int i;
 
+#if defined(_KERNEL) && defined(HAVE_SPL)
+	/* Large allocations which do not require contiguous pages
+	 * should be using vmem_free() in the linux kernel */
+	vmem_free(buf_hash_table.ht_table,
+	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#else
 	kmem_free(buf_hash_table.ht_table,
 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#endif
 	for (i = 0; i < BUF_LOCKS; i++)
 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 	kmem_cache_destroy(hdr_cache);
@@ -779,6 +818,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag)
 	refcount_create(&buf->b_refcnt);
 	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_link_init(&buf->b_arc_node);
+	list_link_init(&buf->b_l2node);
 	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 
 	return (0);
@@ -791,7 +832,8 @@ buf_cons(void *vbuf, void *unused, int kmflag)
 	arc_buf_t *buf = vbuf;
 
 	bzero(buf, sizeof (arc_buf_t));
-	rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
+	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+	rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
 	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 
 	return (0);
@@ -807,6 +849,7 @@ hdr_dest(void *vbuf, void *unused)
 {
 	arc_buf_hdr_t *buf = vbuf;
 
+	ASSERT(BUF_EMPTY(buf));
 	refcount_destroy(&buf->b_refcnt);
 	cv_destroy(&buf->b_cv);
 	mutex_destroy(&buf->b_freeze_lock);
@@ -819,7 +862,8 @@ buf_dest(void *vbuf, void *unused)
 {
 	arc_buf_t *buf = vbuf;
 
-	rw_destroy(&buf->b_lock);
+	mutex_destroy(&buf->b_evict_lock);
+	rw_destroy(&buf->b_data_lock);
 	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
@@ -855,8 +899,15 @@ buf_init(void)
 		hsize <<= 1;
 retry:
 	buf_hash_table.ht_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+	/* Large allocations which do not require contiguous pages
+	 * should be using vmem_alloc() in the linux kernel */
+	buf_hash_table.ht_table =
+	    vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+#else
 	buf_hash_table.ht_table =
 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+#endif
 	if (buf_hash_table.ht_table == NULL) {
 		ASSERT(hsize > (1ULL << 8));
 		hsize >>= 1;
@@ -947,18 +998,31 @@ arc_buf_thaw(arc_buf_t *buf)
 		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
 		buf->b_hdr->b_freeze_cksum = NULL;
 	}
+
+	if (zfs_flags & ZFS_DEBUG_MODIFY) {
+		if (buf->b_hdr->b_thawed)
+			kmem_free(buf->b_hdr->b_thawed, 1);
+		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+	}
+
 	mutex_exit(&buf->b_hdr->b_freeze_lock);
 }
 
 void
 arc_buf_freeze(arc_buf_t *buf)
 {
+	kmutex_t *hash_lock;
+
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
+	hash_lock = HDR_LOCK(buf->b_hdr);
+	mutex_enter(hash_lock);
+
 	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
 	    buf->b_hdr->b_state == arc_anon);
 	arc_cksum_compute(buf, B_FALSE);
+	mutex_exit(hash_lock);
 }
 
 static void
@@ -1030,6 +1094,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 	ASSERT(new_state != old_state);
 	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
 	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
 
 	from_delta = to_delta = ab->b_datacnt * ab->b_size;
 
@@ -1050,7 +1115,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 
 			/*
 			 * If prefetching out of the ghost cache,
-			 * we will have a non-null datacnt.
+			 * we will have a non-zero datacnt.
 			 */
 			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
 				/* ghost elements have a ghost size */
@@ -1086,9 +1151,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 	}
 
 	ASSERT(!BUF_EMPTY(ab));
-	if (new_state == arc_anon) {
+	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
 		buf_hash_remove(ab);
-	}
 
 	/* adjust state sizes */
 	if (to_delta)
@@ -1112,6 +1176,8 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	switch (type) {
+	default:
+		break;
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, space);
 		break;
@@ -1136,6 +1202,8 @@ arc_space_return(uint64_t space, arc_space_type_t type)
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	switch (type) {
+	default:
+		break;
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, -space);
 		break;
@@ -1232,14 +1300,29 @@ arc_return_buf(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
-	ASSERT(hdr->b_state == arc_anon);
 	ASSERT(buf->b_data != NULL);
-	VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
-	VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
+	(void) refcount_add(&hdr->b_refcnt, tag);
+	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
 
 	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
 }
 
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+	arc_buf_hdr_t *hdr;
+
+	ASSERT(buf->b_data != NULL);
+	hdr = buf->b_hdr;
+	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
+	(void) refcount_remove(&hdr->b_refcnt, tag);
+	buf->b_efunc = NULL;
+	buf->b_private = NULL;
+
+	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
+}
+
 static arc_buf_t *
 arc_buf_clone(arc_buf_t *from)
 {
@@ -1247,6 +1330,8 @@ arc_buf_clone(arc_buf_t *from)
 	arc_buf_hdr_t *hdr = from->b_hdr;
 	uint64_t size = hdr->b_size;
 
+	ASSERT(hdr->b_state != arc_anon);
+
 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 	buf->b_hdr = hdr;
 	buf->b_data = NULL;
@@ -1271,16 +1356,16 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
 	 * must verify b_data != NULL to know if the add_ref
 	 * was successful.
 	 */
-	rw_enter(&buf->b_lock, RW_READER);
+	mutex_enter(&buf->b_evict_lock);
 	if (buf->b_data == NULL) {
-		rw_exit(&buf->b_lock);
+		mutex_exit(&buf->b_evict_lock);
 		return;
 	}
-	hdr = buf->b_hdr;
-	ASSERT(hdr != NULL);
-	hash_lock = HDR_LOCK(hdr);
+	hash_lock = HDR_LOCK(buf->b_hdr);
 	mutex_enter(hash_lock);
-	rw_exit(&buf->b_lock);
+	hdr = buf->b_hdr;
+	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+	mutex_exit(&buf->b_evict_lock);
 
 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
 	add_reference(hdr, hash_lock, tag);
@@ -1328,6 +1413,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
 		arc_buf_contents_t type = buf->b_hdr->b_type;
 
 		arc_cksum_verify(buf);
+
 		if (!recycle) {
 			if (type == ARC_BUFC_METADATA) {
 				arc_buf_data_free(buf->b_hdr, zio_buf_free,
@@ -1365,6 +1451,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
 	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
 		continue;
 	*bufp = buf->b_next;
+	buf->b_next = NULL;
 
 	ASSERT(buf->b_efunc == NULL);
 
@@ -1376,58 +1463,59 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
 static void
 arc_hdr_destroy(arc_buf_hdr_t *hdr)
 {
+	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+
 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
 	ASSERT3P(hdr->b_state, ==, arc_anon);
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-	ASSERT(!(hdr->b_flags & ARC_STORED));
 
-	if (hdr->b_l2hdr != NULL) {
-		if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
-			/*
-			 * To prevent arc_free() and l2arc_evict() from
-			 * attempting to free the same buffer at the same time,
-			 * a FREE_IN_PROGRESS flag is given to arc_free() to
-			 * give it priority.  l2arc_evict() can't destroy this
-			 * header while we are waiting on l2arc_buflist_mtx.
-			 *
-			 * The hdr may be removed from l2ad_buflist before we
-			 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
-			 */
+	if (l2hdr != NULL) {
+		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
+		/*
+		 * To prevent arc_free() and l2arc_evict() from
+		 * attempting to free the same buffer at the same time,
+		 * a FREE_IN_PROGRESS flag is given to arc_free() to
+		 * give it priority.  l2arc_evict() can't destroy this
+		 * header while we are waiting on l2arc_buflist_mtx.
+		 *
+		 * The hdr may be removed from l2ad_buflist before we
+		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+		 */
+		if (!buflist_held) {
 			mutex_enter(&l2arc_buflist_mtx);
-			if (hdr->b_l2hdr != NULL) {
-				list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
-				    hdr);
-			}
-			mutex_exit(&l2arc_buflist_mtx);
-		} else {
-			list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+			l2hdr = hdr->b_l2hdr;
 		}
-		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
-		kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
-		if (hdr->b_state == arc_l2c_only)
-			l2arc_hdr_stat_remove();
-		hdr->b_l2hdr = NULL;
+
+		if (l2hdr != NULL) {
+			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+			if (hdr->b_state == arc_l2c_only)
+				l2arc_hdr_stat_remove();
+			hdr->b_l2hdr = NULL;
+		}
+
+		if (!buflist_held)
+			mutex_exit(&l2arc_buflist_mtx);
 	}
 
 	if (!BUF_EMPTY(hdr)) {
 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
-		bzero(&hdr->b_dva, sizeof (dva_t));
-		hdr->b_birth = 0;
-		hdr->b_cksum0 = 0;
+		buf_discard_identity(hdr);
 	}
 	while (hdr->b_buf) {
 		arc_buf_t *buf = hdr->b_buf;
 
 		if (buf->b_efunc) {
 			mutex_enter(&arc_eviction_mtx);
-			rw_enter(&buf->b_lock, RW_WRITER);
+			mutex_enter(&buf->b_evict_lock);
 			ASSERT(buf->b_hdr != NULL);
 			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
 			hdr->b_buf = buf->b_next;
 			buf->b_hdr = &arc_eviction_hdr;
 			buf->b_next = arc_eviction_list;
 			arc_eviction_list = buf;
-			rw_exit(&buf->b_lock);
+			mutex_exit(&buf->b_evict_lock);
 			mutex_exit(&arc_eviction_mtx);
 		} else {
 			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
@@ -1437,6 +1525,10 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
 		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
 		hdr->b_freeze_cksum = NULL;
 	}
+	if (hdr->b_thawed) {
+		kmem_free(hdr->b_thawed, 1);
+		hdr->b_thawed = NULL;
+	}
 
 	ASSERT(!list_link_active(&hdr->b_arc_node));
 	ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1457,11 +1549,17 @@ arc_buf_free(arc_buf_t *buf, void *tag)
 		kmutex_t *hash_lock = HDR_LOCK(hdr);
 
 		mutex_enter(hash_lock);
+		hdr = buf->b_hdr;
+		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
 		(void) remove_reference(hdr, hash_lock, tag);
-		if (hdr->b_datacnt > 1)
+		if (hdr->b_datacnt > 1) {
 			arc_buf_destroy(buf, FALSE, TRUE);
-		else
+		} else {
+			ASSERT(buf == hdr->b_buf);
+			ASSERT(buf->b_efunc == NULL);
 			hdr->b_flags |= ARC_BUF_AVAILABLE;
+		}
 		mutex_exit(hash_lock);
 	} else if (HDR_IO_IN_PROGRESS(hdr)) {
 		int destroy_hdr;
@@ -1478,12 +1576,10 @@ arc_buf_free(arc_buf_t *buf, void *tag)
 		if (destroy_hdr)
 			arc_hdr_destroy(hdr);
 	} else {
-		if (remove_reference(hdr, NULL, tag) > 0) {
-			ASSERT(HDR_IO_ERROR(hdr));
+		if (remove_reference(hdr, NULL, tag) > 0)
 			arc_buf_destroy(buf, FALSE, TRUE);
-		} else {
+		else
 			arc_hdr_destroy(hdr);
-		}
 	}
 }
 
@@ -1495,11 +1591,14 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
 	int no_callback = (buf->b_efunc == NULL);
 
 	if (hdr->b_state == arc_anon) {
+		ASSERT(hdr->b_datacnt == 1);
 		arc_buf_free(buf, tag);
 		return (no_callback);
 	}
 
 	mutex_enter(hash_lock);
+	hdr = buf->b_hdr;
+	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	ASSERT(hdr->b_state != arc_anon);
 	ASSERT(buf->b_data != NULL);
 
@@ -1509,6 +1608,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
 			arc_buf_destroy(buf, FALSE, TRUE);
 	} else if (no_callback) {
 		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+		ASSERT(buf->b_efunc == NULL);
 		hdr->b_flags |= ARC_BUF_AVAILABLE;
 	}
 	ASSERT(no_callback || hdr->b_datacnt > 1 ||
@@ -1561,7 +1661,8 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
 		if (HDR_IO_IN_PROGRESS(ab) ||
 		    (spa && ab->b_spa != spa) ||
 		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
-		    lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
+		    ddi_get_lbolt() - ab->b_arc_access <
+		    arc_min_prefetch_lifespan)) {
 			skipped++;
 			continue;
 		}
@@ -1576,7 +1677,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
 			ASSERT(ab->b_datacnt > 0);
 			while (ab->b_buf) {
 				arc_buf_t *buf = ab->b_buf;
-				if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
+				if (!mutex_tryenter(&buf->b_evict_lock)) {
 					missed += 1;
 					break;
 				}
@@ -1598,13 +1699,28 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
 					buf->b_next = arc_eviction_list;
 					arc_eviction_list = buf;
 					mutex_exit(&arc_eviction_mtx);
-					rw_exit(&buf->b_lock);
+					mutex_exit(&buf->b_evict_lock);
 				} else {
-					rw_exit(&buf->b_lock);
+					mutex_exit(&buf->b_evict_lock);
 					arc_buf_destroy(buf,
 					    buf->b_data == stolen, TRUE);
 				}
 			}
+
+			if (ab->b_l2hdr) {
+				ARCSTAT_INCR(arcstat_evict_l2_cached,
+				    ab->b_size);
+			} else {
+				if (l2arc_write_eligible(ab->b_spa, ab)) {
+					ARCSTAT_INCR(arcstat_evict_l2_eligible,
+					    ab->b_size);
+				} else {
+					ARCSTAT_INCR(
+					    arcstat_evict_l2_ineligible,
+					    ab->b_size);
+				}
+			}
+
 			if (ab->b_datacnt == 0) {
 				arc_change_state(evicted_state, ab, hash_lock);
 				ASSERT(HDR_IN_HASH_TABLE(ab));
@@ -1625,7 +1741,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
 	mutex_exit(&state->arcs_mtx);
 
 	if (bytes_evicted < bytes)
-		dprintf("only evicted %lld bytes from %x",
+		dprintf("only evicted %lld bytes from %x\n",
 		    (longlong_t)bytes_evicted, state);
 
 	if (skipped)
@@ -1646,12 +1762,12 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
 		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
 			int64_t todelete =
 			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
-			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
+			arc_evict_ghost(arc_mru_ghost, 0, todelete);
 		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
 			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
 			    arc_mru_ghost->arcs_size +
 			    arc_mfu_ghost->arcs_size - arc_c);
-			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
+			arc_evict_ghost(arc_mfu_ghost, 0, todelete);
 		}
 	}
 
@@ -1666,19 +1782,29 @@ static void
 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
 {
 	arc_buf_hdr_t *ab, *ab_prev;
+	arc_buf_hdr_t marker;
 	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
 	kmutex_t *hash_lock;
 	uint64_t bytes_deleted = 0;
 	uint64_t bufs_skipped = 0;
 
 	ASSERT(GHOST_STATE(state));
+	bzero(&marker, sizeof(marker));
 top:
 	mutex_enter(&state->arcs_mtx);
 	for (ab = list_tail(list); ab; ab = ab_prev) {
 		ab_prev = list_prev(list, ab);
 		if (spa && ab->b_spa != spa)
 			continue;
+
+		/* ignore markers */
+		if (ab->b_spa == 0)
+			continue;
+
 		hash_lock = HDR_LOCK(ab);
+		/* caller may be trying to modify this buffer, skip it */
+		if (MUTEX_HELD(hash_lock))
+			continue;
 		if (mutex_tryenter(hash_lock)) {
 			ASSERT(!HDR_IO_IN_PROGRESS(ab));
 			ASSERT(ab->b_buf == NULL);
@@ -1701,15 +1827,21 @@ top:
 			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
 			if (bytes >= 0 && bytes_deleted >= bytes)
 				break;
-		} else {
-			if (bytes < 0) {
-				mutex_exit(&state->arcs_mtx);
-				mutex_enter(hash_lock);
-				mutex_exit(hash_lock);
-				goto top;
-			}
+		} else if (bytes < 0) {
+			/*
+			 * Insert a list marker and then wait for the
+			 * hash lock to become available. Once its
+			 * available, restart from where we left off.
+			 */
+			list_insert_after(list, ab, &marker);
+			mutex_exit(&state->arcs_mtx);
+			mutex_enter(hash_lock);
+			mutex_exit(hash_lock);
+			mutex_enter(&state->arcs_mtx);
+			ab_prev = list_prev(list, &marker);
+			list_remove(list, &marker);
+		} else
 			bufs_skipped += 1;
-		}
 	}
 	mutex_exit(&state->arcs_mtx);
 
@@ -1725,7 +1857,7 @@ top:
 	}
 
 	if (bytes_deleted < bytes)
-		dprintf("only deleted %lld bytes from %p",
+		dprintf("only deleted %lld bytes from %p\n",
 		    (longlong_t)bytes_deleted, state);
 }
 
@@ -1738,18 +1870,19 @@ arc_adjust(void)
 	 * Adjust MRU size
 	 */
 
-	adjustment = MIN(arc_size - arc_c,
-	    arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p);
+	adjustment = MIN((int64_t)(arc_size - arc_c),
+	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
+	    arc_p));
 
 	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
 		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
-		(void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
+		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
 		adjustment -= delta;
 	}
 
 	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
 		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
-		(void) arc_evict(arc_mru, NULL, delta, FALSE,
+		(void) arc_evict(arc_mru, 0, delta, FALSE,
 		    ARC_BUFC_METADATA);
 	}
 
@@ -1761,14 +1894,14 @@ arc_adjust(void)
 
 	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
 		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
-		(void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
+		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
 		adjustment -= delta;
 	}
 
 	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
 		int64_t delta = MIN(adjustment,
 		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
-		(void) arc_evict(arc_mfu, NULL, delta, FALSE,
+		(void) arc_evict(arc_mfu, 0, delta, FALSE,
 		    ARC_BUFC_METADATA);
 	}
 
@@ -1780,7 +1913,7 @@ arc_adjust(void)
 
 	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
 		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
-		arc_evict_ghost(arc_mru_ghost, NULL, delta);
+		arc_evict_ghost(arc_mru_ghost, 0, delta);
 	}
 
 	adjustment =
@@ -1788,7 +1921,7 @@ arc_adjust(void)
 
 	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
 		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
-		arc_evict_ghost(arc_mfu_ghost, NULL, delta);
+		arc_evict_ghost(arc_mfu_ghost, 0, delta);
 	}
 }
 
@@ -1799,9 +1932,9 @@ arc_do_user_evicts(void)
 	while (arc_eviction_list != NULL) {
 		arc_buf_t *buf = arc_eviction_list;
 		arc_eviction_list = buf->b_next;
-		rw_enter(&buf->b_lock, RW_WRITER);
+		mutex_enter(&buf->b_evict_lock);
 		buf->b_hdr = NULL;
-		rw_exit(&buf->b_lock);
+		mutex_exit(&buf->b_evict_lock);
 		mutex_exit(&arc_eviction_mtx);
 
 		if (buf->b_efunc != NULL)
@@ -1889,9 +2022,8 @@ arc_shrink(void)
 static int
 arc_reclaim_needed(void)
 {
-	uint64_t extra;
-
 #ifdef _KERNEL
+	uint64_t extra;
 
 	if (needfree)
 		return (1);
@@ -1953,14 +2085,16 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 	kmem_cache_t		*prev_data_cache = NULL;
 	extern kmem_cache_t	*zio_buf_cache[];
 	extern kmem_cache_t	*zio_data_buf_cache[];
-
 #ifdef _KERNEL
-	if (arc_meta_used >= arc_meta_limit) {
+	int			retry = 0;
+
+	while ((arc_meta_used >= arc_meta_limit) && (retry < 10)) {
 		/*
 		 * We are exceeding our meta-data cache limit.
 		 * Purge some DNLC entries to release holds on meta-data.
 		 */
 		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
+		retry++;
 	}
 #if defined(__i386)
 	/*
@@ -2017,26 +2151,28 @@ arc_reclaim_thread(void)
 			}
 
 			/* reset the growth delay for every reclaim */
-			growtime = lbolt + (arc_grow_retry * hz);
+			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
 
 			arc_kmem_reap_now(last_reclaim);
 			arc_warm = B_TRUE;
 
-		} else if (arc_no_grow && lbolt >= growtime) {
+		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
 			arc_no_grow = FALSE;
 		}
 
-		if (2 * arc_c < arc_size +
-		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
-			arc_adjust();
+		/* Keep meta data usage within limits */
+		if (arc_meta_used >= arc_meta_limit)
+			arc_kmem_reap_now(ARC_RECLAIM_CONS);
+
+		arc_adjust();
 
 		if (arc_eviction_list != NULL)
 			arc_do_user_evicts();
 
 		/* block until needed, or one second, whichever is shorter */
 		CALLB_CPR_SAFE_BEGIN(&cpr);
-		(void) cv_timedwait(&arc_reclaim_thr_cv,
-		    &arc_reclaim_thr_lock, (lbolt + hz));
+		(void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
+		    &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
 	}
 
@@ -2046,6 +2182,59 @@ arc_reclaim_thread(void)
 	thread_exit();
 }
 
+#ifdef _KERNEL
+/*
+ * Under Linux the arc shrinker may be called for synchronous (direct)
+ * reclaim, or asynchronous (indirect) reclaim.  When called by kswapd
+ * for indirect reclaim we take a conservative approach and just reap
+ * free slabs from the ARC caches.  If this proves to be insufficient
+ * direct reclaim will be trigger.  In direct reclaim a more aggressive
+ * strategy is used, data is evicted from the ARC and free slabs reaped.
+ */
+static int
+__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
+{
+	arc_reclaim_strategy_t strategy;
+	int arc_reclaim;
+
+	/* Return number of reclaimable pages based on arc_shrink_shift */
+	arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
+	    >> arc_shrink_shift, 0);
+	if (sc->nr_to_scan == 0)
+		return (arc_reclaim);
+
+	/* Prevent reclaim below arc_c_min */
+	if (arc_reclaim <= 0)
+		return (-1);
+
+	/* Not allowed to perform filesystem reclaim */
+	if (!(sc->gfp_mask & __GFP_FS))
+		return (-1);
+
+	/* Reclaim in progress */
+	if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
+		return (-1);
+
+	if (current_is_kswapd()) {
+		strategy = ARC_RECLAIM_CONS;
+		ARCSTAT_INCR(arcstat_memory_indirect_count, 1);
+	} else {
+		strategy = ARC_RECLAIM_AGGR;
+		ARCSTAT_INCR(arcstat_memory_direct_count, 1);
+	}
+
+	arc_kmem_reap_now(strategy);
+	arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
+	    >> arc_shrink_shift, 0);
+	mutex_exit(&arc_reclaim_thr_lock);
+
+	return (arc_reclaim);
+}
+SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
+
+SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
+#endif /* _KERNEL */
+
 /*
  * Adapt arc info given the number of bytes we are trying to add and
  * the state that we are comming from.  This function is only called
@@ -2072,6 +2261,7 @@ arc_adapt(int bytes, arc_state_t *state)
 	if (state == arc_mru_ghost) {
 		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
 		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
 
 		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
 	} else if (state == arc_mfu_ghost) {
@@ -2079,6 +2269,7 @@ arc_adapt(int bytes, arc_state_t *state)
 
 		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
 		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+		mult = MIN(mult, 10);
 
 		delta = MIN(bytes * mult, arc_p);
 		arc_p = MAX(arc_p_min, arc_p - delta);
@@ -2208,7 +2399,7 @@ arc_get_data_buf(arc_buf_t *buf)
 		state =  (arc_mru->arcs_lsize[type] >= size &&
 		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
 	}
-	if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
+	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
 		if (type == ARC_BUFC_METADATA) {
 			buf->b_data = zio_buf_alloc(size);
 			arc_space_consume(size, ARC_SPACE_DATA);
@@ -2251,6 +2442,8 @@ out:
 static void
 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 {
+	clock_t now;
+
 	ASSERT(MUTEX_HELD(hash_lock));
 
 	if (buf->b_state == arc_anon) {
@@ -2261,11 +2454,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 		 */
 
 		ASSERT(buf->b_arc_access == 0);
-		buf->b_arc_access = lbolt;
+		buf->b_arc_access = ddi_get_lbolt();
 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
 		arc_change_state(arc_mru, buf, hash_lock);
 
 	} else if (buf->b_state == arc_mru) {
+		now = ddi_get_lbolt();
+
 		/*
 		 * If this buffer is here because of a prefetch, then either:
 		 * - clear the flag if this is a "referencing" read
@@ -2281,7 +2476,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 				buf->b_flags &= ~ARC_PREFETCH;
 				ARCSTAT_BUMP(arcstat_mru_hits);
 			}
-			buf->b_arc_access = lbolt;
+			buf->b_arc_access = now;
 			return;
 		}
 
@@ -2290,13 +2485,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 		 * but it is still in the cache. Move it to the MFU
 		 * state.
 		 */
-		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
+		if (now > buf->b_arc_access + ARC_MINTIME) {
 			/*
 			 * More than 125ms have passed since we
 			 * instantiated this buffer.  Move it to the
 			 * most frequently used state.
 			 */
-			buf->b_arc_access = lbolt;
+			buf->b_arc_access = now;
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
 			arc_change_state(arc_mfu, buf, hash_lock);
 		}
@@ -2319,7 +2514,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
 		}
 
-		buf->b_arc_access = lbolt;
+		buf->b_arc_access = ddi_get_lbolt();
 		arc_change_state(new_state, buf, hash_lock);
 
 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
@@ -2338,7 +2533,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 			ASSERT(list_link_active(&buf->b_arc_node));
 		}
 		ARCSTAT_BUMP(arcstat_mfu_hits);
-		buf->b_arc_access = lbolt;
+		buf->b_arc_access = ddi_get_lbolt();
 	} else if (buf->b_state == arc_mfu_ghost) {
 		arc_state_t	*new_state = arc_mfu;
 		/*
@@ -2356,7 +2551,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 			new_state = arc_mru;
 		}
 
-		buf->b_arc_access = lbolt;
+		buf->b_arc_access = ddi_get_lbolt();
 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
 		arc_change_state(new_state, buf, hash_lock);
 
@@ -2366,7 +2561,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 		 * This buffer is on the 2nd Level ARC.
 		 */
 
-		buf->b_arc_access = lbolt;
+		buf->b_arc_access = ddi_get_lbolt();
 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
 		arc_change_state(arc_mfu, buf, hash_lock);
 	} else {
@@ -2379,7 +2574,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 void
 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
-	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+	if (zio == NULL || zio->io_error == 0)
+		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
 	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
 }
 
@@ -2393,6 +2589,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
 		*bufp = NULL;
 	} else {
 		*bufp = buf;
+		ASSERT(buf->b_data);
 	}
 }
 
@@ -2431,7 +2628,7 @@ arc_read_done(zio_t *zio)
 	/* byteswap if necessary */
 	callback_list = hdr->b_acb;
 	ASSERT(callback_list != NULL);
-	if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
 		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
 		    byteswap_uint64_array :
 		    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
@@ -2440,6 +2637,16 @@ arc_read_done(zio_t *zio)
 
 	arc_cksum_compute(buf, B_FALSE);
 
+	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+		/*
+		 * Only call arc_access on anonymous buffers.  This is because
+		 * if we've issued an I/O for an evicted buffer, we've already
+		 * called arc_access (to prevent any simultaneous readers from
+		 * getting confused).
+		 */
+		arc_access(hdr, hash_lock);
+	}
+
 	/* create copies of the data buffer for the callers */
 	abuf = buf;
 	for (acb = callback_list; acb; acb = acb->acb_next) {
@@ -2453,8 +2660,11 @@ arc_read_done(zio_t *zio)
 	hdr->b_acb = NULL;
 	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
 	ASSERT(!HDR_BUF_AVAILABLE(hdr));
-	if (abuf == buf)
+	if (abuf == buf) {
+		ASSERT(buf->b_efunc == NULL);
+		ASSERT(hdr->b_datacnt == 1);
 		hdr->b_flags |= ARC_BUF_AVAILABLE;
+	}
 
 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
 
@@ -2475,14 +2685,6 @@ arc_read_done(zio_t *zio)
 	cv_broadcast(&hdr->b_cv);
 
 	if (hash_lock) {
-		/*
-		 * Only call arc_access on anonymous buffers.  This is because
-		 * if we've issued an I/O for an evicted buffer, we've already
-		 * called arc_access (to prevent any simultaneous readers from
-		 * getting confused).
-		 */
-		if (zio->io_error == 0 && hdr->b_state == arc_anon)
-			arc_access(hdr, hash_lock);
 		mutex_exit(hash_lock);
 	} else {
 		/*
@@ -2536,36 +2738,46 @@ arc_read_done(zio_t *zio)
  * arc_read_bp.
  */
 int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
     arc_done_func_t *done, void *private, int priority, int zio_flags,
     uint32_t *arc_flags, const zbookmark_t *zb)
 {
 	int err;
 
+	if (pbuf == NULL) {
+		/*
+		 * XXX This happens from traverse callback funcs, for
+		 * the objset_phys_t block.
+		 */
+		return (arc_read_nolock(pio, spa, bp, done, private, priority,
+		    zio_flags, arc_flags, zb));
+	}
+
 	ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
 	ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
-	rw_enter(&pbuf->b_lock, RW_READER);
+	rw_enter(&pbuf->b_data_lock, RW_READER);
 
 	err = arc_read_nolock(pio, spa, bp, done, private, priority,
 	    zio_flags, arc_flags, zb);
-	rw_exit(&pbuf->b_lock);
+	rw_exit(&pbuf->b_data_lock);
 
 	return (err);
 }
 
 int
-arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_done_func_t *done, void *private, int priority, int zio_flags,
     uint32_t *arc_flags, const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr;
-	arc_buf_t *buf;
+	arc_buf_t *buf = NULL;
 	kmutex_t *hash_lock;
 	zio_t *rzio;
 	uint64_t guid = spa_guid(spa);
 
 top:
-	hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+	    &hash_lock);
 	if (hdr && hdr->b_datacnt > 0) {
 
 		*arc_flags |= ARC_CACHED;
@@ -2583,7 +2795,7 @@ top:
 				arc_callback_t	*acb = NULL;
 
 				acb = kmem_zalloc(sizeof (arc_callback_t),
-				    KM_SLEEP);
+				    KM_PUSHPAGE);
 				acb->acb_done = done;
 				acb->acb_private = private;
 				if (pio != NULL)
@@ -2619,6 +2831,7 @@ top:
 			} else {
 				buf = arc_buf_clone(buf);
 			}
+
 		} else if (*arc_flags & ARC_PREFETCH &&
 		    refcount_count(&hdr->b_refcnt) == 0) {
 			hdr->b_flags |= ARC_PREFETCH;
@@ -2639,7 +2852,7 @@ top:
 		uint64_t size = BP_GET_LSIZE(bp);
 		arc_callback_t	*acb;
 		vdev_t *vd = NULL;
-		uint64_t addr;
+		uint64_t addr = -1;
 		boolean_t devw = B_FALSE;
 
 		if (hdr == NULL) {
@@ -2649,15 +2862,13 @@ top:
 			buf = arc_buf_alloc(spa, size, private, type);
 			hdr = buf->b_hdr;
 			hdr->b_dva = *BP_IDENTITY(bp);
-			hdr->b_birth = bp->blk_birth;
+			hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
 			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
 			exists = buf_hash_insert(hdr, &hash_lock);
 			if (exists) {
 				/* somebody beat us to the hash insert */
 				mutex_exit(hash_lock);
-				bzero(&hdr->b_dva, sizeof (dva_t));
-				hdr->b_birth = 0;
-				hdr->b_cksum0 = 0;
+				buf_discard_identity(hdr);
 				(void) arc_buf_remove_ref(buf, private);
 				goto top; /* restart the IO request */
 			}
@@ -2692,13 +2903,15 @@ top:
 			buf->b_private = NULL;
 			buf->b_next = NULL;
 			hdr->b_buf = buf;
-			arc_get_data_buf(buf);
 			ASSERT(hdr->b_datacnt == 0);
 			hdr->b_datacnt = 1;
-
+			arc_get_data_buf(buf);
+			arc_access(hdr, hash_lock);
 		}
 
-		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+		ASSERT(!GHOST_STATE(hdr->b_state));
+
+		acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE);
 		acb->acb_done = done;
 		acb->acb_private = private;
 
@@ -2706,17 +2919,6 @@ top:
 		hdr->b_acb = acb;
 		hdr->b_flags |= ARC_IO_IN_PROGRESS;
 
-		/*
-		 * If the buffer has been evicted, migrate it to a present state
-		 * before issuing the I/O.  Once we drop the hash-table lock,
-		 * the header will be marked as I/O in progress and have an
-		 * attached buffer.  At this point, anybody who finds this
-		 * buffer ought to notice that it's legit but has a pending I/O.
-		 */
-
-		if (GHOST_STATE(hdr->b_state))
-			arc_access(hdr, hash_lock);
-
 		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
 		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
 			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
@@ -2732,8 +2934,8 @@ top:
 		mutex_exit(hash_lock);
 
 		ASSERT3U(hdr->b_size, ==, size);
-		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
-		    zbookmark_t *, zb);
+		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
+		    uint64_t, size, zbookmark_t *, zb);
 		ARCSTAT_BUMP(arcstat_misses);
 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
@@ -2758,7 +2960,7 @@ top:
 				ARCSTAT_BUMP(arcstat_l2_hits);
 
 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
-				    KM_SLEEP);
+				    KM_PUSHPAGE);
 				cb->l2rcb_buf = buf;
 				cb->l2rcb_spa = spa;
 				cb->l2rcb_bp = *bp;
@@ -2819,47 +3021,15 @@ top:
 	return (0);
 }
 
-/*
- * arc_read() variant to support pool traversal.  If the block is already
- * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
- * The idea is that we don't want pool traversal filling up memory, but
- * if the ARC already has the data anyway, we shouldn't pay for the I/O.
- */
-int
-arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
-{
-	arc_buf_hdr_t *hdr;
-	kmutex_t *hash_mtx;
-	uint64_t guid = spa_guid(spa);
-	int rc = 0;
-
-	hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
-
-	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
-		arc_buf_t *buf = hdr->b_buf;
-
-		ASSERT(buf);
-		while (buf->b_data == NULL) {
-			buf = buf->b_next;
-			ASSERT(buf);
-		}
-		bcopy(buf->b_data, data, hdr->b_size);
-	} else {
-		rc = ENOENT;
-	}
-
-	if (hash_mtx)
-		mutex_exit(hash_mtx);
-
-	return (rc);
-}
-
 void
 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
 {
 	ASSERT(buf->b_hdr != NULL);
 	ASSERT(buf->b_hdr->b_state != arc_anon);
 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+	ASSERT(buf->b_efunc == NULL);
+	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
+
 	buf->b_efunc = func;
 	buf->b_private = private;
 }
@@ -2876,14 +3046,14 @@ arc_buf_evict(arc_buf_t *buf)
 	kmutex_t *hash_lock;
 	arc_buf_t **bufp;
 
-	rw_enter(&buf->b_lock, RW_WRITER);
+	mutex_enter(&buf->b_evict_lock);
 	hdr = buf->b_hdr;
 	if (hdr == NULL) {
 		/*
 		 * We are in arc_do_user_evicts().
 		 */
 		ASSERT(buf->b_data == NULL);
-		rw_exit(&buf->b_lock);
+		mutex_exit(&buf->b_evict_lock);
 		return (0);
 	} else if (buf->b_data == NULL) {
 		arc_buf_t copy = *buf; /* structure assignment */
@@ -2892,14 +3062,15 @@ arc_buf_evict(arc_buf_t *buf)
 		 * but let arc_do_user_evicts() do the reaping.
 		 */
 		buf->b_efunc = NULL;
-		rw_exit(&buf->b_lock);
+		mutex_exit(&buf->b_evict_lock);
 		VERIFY(copy.b_efunc(&copy) == 0);
 		return (1);
 	}
 	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
+	hdr = buf->b_hdr;
+	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
-	ASSERT(buf->b_hdr == hdr);
 	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
 
@@ -2918,6 +3089,7 @@ arc_buf_evict(arc_buf_t *buf)
 		arc_state_t *old_state = hdr->b_state;
 		arc_state_t *evicted_state;
 
+		ASSERT(hdr->b_buf == NULL);
 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
 
 		evicted_state =
@@ -2935,12 +3107,13 @@ arc_buf_evict(arc_buf_t *buf)
 		mutex_exit(&old_state->arcs_mtx);
 	}
 	mutex_exit(hash_lock);
-	rw_exit(&buf->b_lock);
+	mutex_exit(&buf->b_evict_lock);
 
 	VERIFY(buf->b_efunc(buf) == 0);
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
 	buf->b_hdr = NULL;
+	buf->b_next = NULL;
 	kmem_cache_free(buf_cache, buf);
 	return (1);
 }
@@ -2955,29 +3128,30 @@ void
 arc_release(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *hdr;
-	kmutex_t *hash_lock;
+	kmutex_t *hash_lock = NULL;
 	l2arc_buf_hdr_t *l2hdr;
-	uint64_t buf_size;
-	boolean_t released = B_FALSE;
+	uint64_t buf_size = 0;
 
-	rw_enter(&buf->b_lock, RW_WRITER);
+	/*
+	 * It would be nice to assert that if it's DMU metadata (level >
+	 * 0 || it's the dnode file), then it must be syncing context.
+	 * But we don't know that information at this level.
+	 */
+
+	mutex_enter(&buf->b_evict_lock);
 	hdr = buf->b_hdr;
 
 	/* this buffer is not on any list */
 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
-	ASSERT(!(hdr->b_flags & ARC_STORED));
 
 	if (hdr->b_state == arc_anon) {
 		/* this buffer is already released */
-		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
-		ASSERT(BUF_EMPTY(hdr));
 		ASSERT(buf->b_efunc == NULL);
-		arc_buf_thaw(buf);
-		rw_exit(&buf->b_lock);
-		released = B_TRUE;
 	} else {
 		hash_lock = HDR_LOCK(hdr);
 		mutex_enter(hash_lock);
+		hdr = buf->b_hdr;
+		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	}
 
 	l2hdr = hdr->b_l2hdr;
@@ -2987,9 +3161,6 @@ arc_release(arc_buf_t *buf, void *tag)
 		buf_size = hdr->b_size;
 	}
 
-	if (released)
-		goto out;
-
 	/*
 	 * Do we have more than one buf?
 	 */
@@ -3003,14 +3174,14 @@ arc_release(arc_buf_t *buf, void *tag)
 
 		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
 		/*
-		 * Pull the data off of this buf and attach it to
-		 * a new anonymous buf.
+		 * Pull the data off of this hdr and attach it to
+		 * a new anonymous hdr.
 		 */
 		(void) remove_reference(hdr, hash_lock, tag);
 		bufp = &hdr->b_buf;
 		while (*bufp != buf)
 			bufp = &(*bufp)->b_next;
-		*bufp = (*bufp)->b_next;
+		*bufp = buf->b_next;
 		buf->b_next = NULL;
 
 		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
@@ -3038,26 +3209,25 @@ arc_release(arc_buf_t *buf, void *tag)
 		nhdr->b_freeze_cksum = NULL;
 		(void) refcount_add(&nhdr->b_refcnt, tag);
 		buf->b_hdr = nhdr;
-		rw_exit(&buf->b_lock);
+		mutex_exit(&buf->b_evict_lock);
 		atomic_add_64(&arc_anon->arcs_size, blksz);
 	} else {
-		rw_exit(&buf->b_lock);
+		mutex_exit(&buf->b_evict_lock);
 		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
 		ASSERT(!list_link_active(&hdr->b_arc_node));
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-		arc_change_state(arc_anon, hdr, hash_lock);
+		if (hdr->b_state != arc_anon)
+			arc_change_state(arc_anon, hdr, hash_lock);
 		hdr->b_arc_access = 0;
-		mutex_exit(hash_lock);
+		if (hash_lock)
+			mutex_exit(hash_lock);
 
-		bzero(&hdr->b_dva, sizeof (dva_t));
-		hdr->b_birth = 0;
-		hdr->b_cksum0 = 0;
+		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 	}
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
 
-out:
 	if (l2hdr) {
 		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
 		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
@@ -3066,14 +3236,27 @@ out:
 	}
 }
 
+/*
+ * Release this buffer.  If it does not match the provided BP, fill it
+ * with that block's contents.
+ */
+/* ARGSUSED */
+int
+arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+    zbookmark_t *zb)
+{
+	arc_release(buf, tag);
+	return (0);
+}
+
 int
 arc_released(arc_buf_t *buf)
 {
 	int released;
 
-	rw_enter(&buf->b_lock, RW_READER);
+	mutex_enter(&buf->b_evict_lock);
 	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
-	rw_exit(&buf->b_lock);
+	mutex_exit(&buf->b_evict_lock);
 	return (released);
 }
 
@@ -3082,9 +3265,9 @@ arc_has_callback(arc_buf_t *buf)
 {
 	int callback;
 
-	rw_enter(&buf->b_lock, RW_READER);
+	mutex_enter(&buf->b_evict_lock);
 	callback = (buf->b_efunc != NULL);
-	rw_exit(&buf->b_lock);
+	mutex_exit(&buf->b_evict_lock);
 	return (callback);
 }
 
@@ -3094,9 +3277,9 @@ arc_referenced(arc_buf_t *buf)
 {
 	int referenced;
 
-	rw_enter(&buf->b_lock, RW_READER);
+	mutex_enter(&buf->b_evict_lock);
 	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
-	rw_exit(&buf->b_lock);
+	mutex_exit(&buf->b_evict_lock);
 	return (referenced);
 }
 #endif
@@ -3136,21 +3319,28 @@ arc_write_done(zio_t *zio)
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
-	hdr->b_acb = NULL;
+	ASSERT(hdr->b_acb == NULL);
+
+	if (zio->io_error == 0) {
+		hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+		hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+		hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+	} else {
+		ASSERT(BUF_EMPTY(hdr));
+	}
 
-	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
-	hdr->b_birth = zio->io_bp->blk_birth;
-	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
 	/*
 	 * If the block to be written was all-zero, we may have
 	 * compressed it away.  In this case no write was performed
-	 * so there will be no dva/birth-date/checksum.  The buffer
-	 * must therefor remain anonymous (and uncached).
+	 * so there will be no dva/birth/checksum.  The buffer must
+	 * therefore remain anonymous (and uncached).
 	 */
 	if (!BUF_EMPTY(hdr)) {
 		arc_buf_hdr_t *exists;
 		kmutex_t *hash_lock;
 
+		ASSERT(zio->io_error == 0);
+
 		arc_cksum_verify(buf);
 
 		exists = buf_hash_insert(hdr, &hash_lock);
@@ -3160,106 +3350,54 @@ arc_write_done(zio_t *zio)
 			 * sync-to-convergence, because we remove
 			 * buffers from the hash table when we arc_free().
 			 */
-			ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
-			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
-			    BP_IDENTITY(zio->io_bp)));
-			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
-			    zio->io_bp->blk_birth);
-
-			ASSERT(refcount_is_zero(&exists->b_refcnt));
-			arc_change_state(arc_anon, exists, hash_lock);
-			mutex_exit(hash_lock);
-			arc_hdr_destroy(exists);
-			exists = buf_hash_insert(hdr, &hash_lock);
-			ASSERT3P(exists, ==, NULL);
+			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+					panic("bad overwrite, hdr=%p exists=%p",
+					    (void *)hdr, (void *)exists);
+				ASSERT(refcount_is_zero(&exists->b_refcnt));
+				arc_change_state(arc_anon, exists, hash_lock);
+				mutex_exit(hash_lock);
+				arc_hdr_destroy(exists);
+				exists = buf_hash_insert(hdr, &hash_lock);
+				ASSERT3P(exists, ==, NULL);
+			} else {
+				/* Dedup */
+				ASSERT(hdr->b_datacnt == 1);
+				ASSERT(hdr->b_state == arc_anon);
+				ASSERT(BP_GET_DEDUP(zio->io_bp));
+				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+			}
 		}
 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
 		/* if it's not anon, we are doing a scrub */
-		if (hdr->b_state == arc_anon)
+		if (!exists && hdr->b_state == arc_anon)
 			arc_access(hdr, hash_lock);
 		mutex_exit(hash_lock);
-	} else if (callback->awcb_done == NULL) {
-		int destroy_hdr;
-		/*
-		 * This is an anonymous buffer with no user callback,
-		 * destroy it if there are no active references.
-		 */
-		mutex_enter(&arc_eviction_mtx);
-		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
-		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
-		mutex_exit(&arc_eviction_mtx);
-		if (destroy_hdr)
-			arc_hdr_destroy(hdr);
 	} else {
 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
 	}
-	hdr->b_flags &= ~ARC_STORED;
 
-	if (callback->awcb_done) {
-		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
-		callback->awcb_done(zio, buf, callback->awcb_private);
-	}
+	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+	callback->awcb_done(zio, buf, callback->awcb_private);
 
 	kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
-void
-write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
-{
-	boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
-
-	/* Determine checksum setting */
-	if (ismd) {
-		/*
-		 * Metadata always gets checksummed.  If the data
-		 * checksum is multi-bit correctable, and it's not a
-		 * ZBT-style checksum, then it's suitable for metadata
-		 * as well.  Otherwise, the metadata checksum defaults
-		 * to fletcher4.
-		 */
-		if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
-		    !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
-			zp->zp_checksum = wp->wp_oschecksum;
-		else
-			zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
-	} else {
-		zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
-		    wp->wp_oschecksum);
-	}
-
-	/* Determine compression setting */
-	if (ismd) {
-		/*
-		 * XXX -- we should design a compression algorithm
-		 * that specializes in arrays of bps.
-		 */
-		zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
-		    ZIO_COMPRESS_LZJB;
-	} else {
-		zp->zp_compress = zio_compress_select(wp->wp_dncompress,
-		    wp->wp_oscompress);
-	}
-
-	zp->zp_type = wp->wp_type;
-	zp->zp_level = wp->wp_level;
-	zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
-}
-
 zio_t *
-arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
-    boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
-    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
-    int zio_flags, const zbookmark_t *zb)
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+    arc_done_func_t *ready, arc_done_func_t *done, void *private,
+    int priority, int zio_flags, const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
 	zio_t *zio;
-	zio_prop_t zp;
 
 	ASSERT(ready != NULL);
+	ASSERT(done != NULL);
 	ASSERT(!HDR_IO_ERROR(hdr));
 	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
-	ASSERT(hdr->b_acb == 0);
+	ASSERT(hdr->b_acb == NULL);
 	if (l2arc)
 		hdr->b_flags |= ARC_L2CACHE;
 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
@@ -3268,88 +3406,12 @@ arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
 	callback->awcb_private = private;
 	callback->awcb_buf = buf;
 
-	write_policy(spa, wp, &zp);
-	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
+	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
 	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
 
 	return (zio);
 }
 
-int
-arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, uint32_t arc_flags)
-{
-	arc_buf_hdr_t *ab;
-	kmutex_t *hash_lock;
-	zio_t	*zio;
-	uint64_t guid = spa_guid(spa);
-
-	/*
-	 * If this buffer is in the cache, release it, so it
-	 * can be re-used.
-	 */
-	ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
-	if (ab != NULL) {
-		/*
-		 * The checksum of blocks to free is not always
-		 * preserved (eg. on the deadlist).  However, if it is
-		 * nonzero, it should match what we have in the cache.
-		 */
-		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
-		    bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
-		    bp->blk_fill == BLK_FILL_ALREADY_FREED);
-
-		if (ab->b_state != arc_anon)
-			arc_change_state(arc_anon, ab, hash_lock);
-		if (HDR_IO_IN_PROGRESS(ab)) {
-			/*
-			 * This should only happen when we prefetch.
-			 */
-			ASSERT(ab->b_flags & ARC_PREFETCH);
-			ASSERT3U(ab->b_datacnt, ==, 1);
-			ab->b_flags |= ARC_FREED_IN_READ;
-			if (HDR_IN_HASH_TABLE(ab))
-				buf_hash_remove(ab);
-			ab->b_arc_access = 0;
-			bzero(&ab->b_dva, sizeof (dva_t));
-			ab->b_birth = 0;
-			ab->b_cksum0 = 0;
-			ab->b_buf->b_efunc = NULL;
-			ab->b_buf->b_private = NULL;
-			mutex_exit(hash_lock);
-		} else if (refcount_is_zero(&ab->b_refcnt)) {
-			ab->b_flags |= ARC_FREE_IN_PROGRESS;
-			mutex_exit(hash_lock);
-			arc_hdr_destroy(ab);
-			ARCSTAT_BUMP(arcstat_deleted);
-		} else {
-			/*
-			 * We still have an active reference on this
-			 * buffer.  This can happen, e.g., from
-			 * dbuf_unoverride().
-			 */
-			ASSERT(!HDR_IN_HASH_TABLE(ab));
-			ab->b_arc_access = 0;
-			bzero(&ab->b_dva, sizeof (dva_t));
-			ab->b_birth = 0;
-			ab->b_cksum0 = 0;
-			ab->b_buf->b_efunc = NULL;
-			ab->b_buf->b_private = NULL;
-			mutex_exit(hash_lock);
-		}
-	}
-
-	zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
-
-	if (arc_flags & ARC_WAIT)
-		return (zio_wait(zio));
-
-	ASSERT(arc_flags & ARC_NOWAIT);
-	zio_nowait(zio);
-
-	return (0);
-}
-
 static int
 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
 {
@@ -3443,7 +3505,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
 	 * in order to compress/encrypt/etc the data.  We therefor need to
 	 * make sure that there is sufficient available memory for this.
 	 */
-	if (error = arc_memory_throttle(reserve, anon_size, txg))
+	if ((error = arc_memory_throttle(reserve, anon_size, txg)))
 		return (error);
 
 	/*
@@ -3487,16 +3549,22 @@ arc_init(void)
 	 * need to limit the cache to 1/8 of VM size.
 	 */
 	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+	/*
+	 * Register a shrinker to support synchronous (direct) memory
+	 * reclaim from the arc.  This is done to prevent kswapd from
+	 * swapping out pages when it is preferable to shrink the arc.
+	 */
+	spl_register_shrinker(&arc_shrinker);
 #endif
 
 	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
 	arc_c_min = MAX(arc_c / 4, 64<<20);
-	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
-	if (arc_c * 8 >= 1<<30)
-		arc_c_max = (arc_c * 8) - (1<<30);
+	/* set max to 1/2 of all memory, or all but 4GB, whichever is more */
+	if (arc_c * 8 >= ((uint64_t)4<<30))
+		arc_c_max = (arc_c * 8) - ((uint64_t)4<<30);
 	else
 		arc_c_max = arc_c_min;
-	arc_c_max = MAX(arc_c * 6, arc_c_max);
+	arc_c_max = MAX(arc_c * 4, arc_c_max);
 
 	/*
 	 * Allow the tunables to override our calculations if they are
@@ -3512,6 +3580,7 @@ arc_init(void)
 
 	/* limit meta-data to 1/4 of the arc capacity */
 	arc_meta_limit = arc_c_max / 4;
+	arc_meta_max = 0;
 
 	/* Allow the tunable to override if it is reasonable */
 	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
@@ -3529,6 +3598,9 @@ arc_init(void)
 	if (zfs_arc_p_min_shift > 0)
 		arc_p_min_shift = zfs_arc_p_min_shift;
 
+	if (zfs_arc_reduce_dnlc_percent > 0)
+		arc_reduce_dnlc_percent = zfs_arc_reduce_dnlc_percent;
+
 	/* if kmem_flags are set, lets try to use less memory */
 	if (kmem_debugging())
 		arc_c = arc_c / 2;
@@ -3603,6 +3675,10 @@ void
 arc_fini(void)
 {
 	mutex_enter(&arc_reclaim_thr_lock);
+#ifdef _KERNEL
+	spl_unregister_shrinker(&arc_shrinker);
+#endif /* _KERNEL */
+
 	arc_thread_exit = 1;
 	while (arc_thread_exit != 0)
 		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
@@ -3786,12 +3862,11 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
 	/*
 	 * A buffer is *not* eligible for the L2ARC if it:
 	 * 1. belongs to a different spa.
-	 * 2. has no attached buffer.
-	 * 3. is already cached on the L2ARC.
-	 * 4. has an I/O in progress (it may be an incomplete read).
-	 * 5. is flagged not eligible (zfs property).
+	 * 2. is already cached on the L2ARC.
+	 * 3. has an I/O in progress (it may be an incomplete read).
+	 * 4. is flagged not eligible (zfs property).
 	 */
-	if (ab->b_spa != spa_guid || ab->b_buf == NULL || ab->b_l2hdr != NULL ||
+	if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
 	    HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
 		return (B_FALSE);
 
@@ -3815,7 +3890,7 @@ l2arc_write_size(l2arc_dev_t *dev)
 static clock_t
 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
 {
-	clock_t interval, next;
+	clock_t interval, next, now;
 
 	/*
 	 * If the ARC lists are busy, increase our write rate; if the
@@ -3828,7 +3903,8 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
 	else
 		interval = hz * l2arc_feed_secs;
 
-	next = MAX(lbolt, MIN(lbolt + interval, began + interval));
+	now = ddi_get_lbolt();
+	next = MAX(now, MIN(now + interval, began + interval));
 
 	return (next);
 }
@@ -3912,7 +3988,7 @@ out:
  * Free buffers that were tagged for destruction.
  */
 static void
-l2arc_do_free_on_write()
+l2arc_do_free_on_write(void)
 {
 	list_t *buflist;
 	l2arc_data_free_t *df, *df_prev;
@@ -4030,11 +4106,11 @@ l2arc_read_done(zio_t *zio)
 	ASSERT(cb != NULL);
 	buf = cb->l2rcb_buf;
 	ASSERT(buf != NULL);
-	hdr = buf->b_hdr;
-	ASSERT(hdr != NULL);
 
-	hash_lock = HDR_LOCK(hdr);
+	hash_lock = HDR_LOCK(buf->b_hdr);
 	mutex_enter(hash_lock);
+	hdr = buf->b_hdr;
+	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 	/*
 	 * Check this survived the L2ARC journey.
@@ -4092,7 +4168,7 @@ l2arc_read_done(zio_t *zio)
 static list_t *
 l2arc_list_locked(int list_num, kmutex_t **lock)
 {
-	list_t *list;
+	list_t *list = NULL;
 
 	ASSERT(list_num >= 0 && list_num <= 3);
 
@@ -4247,7 +4323,7 @@ top:
 	}
 	mutex_exit(&l2arc_buflist_mtx);
 
-	spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
+	vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
 	dev->l2ad_evict = taddr;
 }
 
@@ -4265,11 +4341,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	list_t *list;
 	uint64_t passed_sz, write_sz, buf_sz, headroom;
 	void *buf_data;
-	kmutex_t *hash_lock, *list_lock;
+	kmutex_t *hash_lock, *list_lock = NULL;
 	boolean_t have_lock, full;
 	l2arc_write_callback_t *cb;
 	zio_t *pio, *wzio;
 	uint64_t guid = spa_guid(spa);
+	int try;
 
 	ASSERT(dev->l2ad_vdev != NULL);
 
@@ -4283,7 +4360,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	 * Copy buffers for L2ARC writing.
 	 */
 	mutex_enter(&l2arc_buflist_mtx);
-	for (int try = 0; try <= 3; try++) {
+	for (try = 0; try <= 3; try++) {
 		list = l2arc_list_locked(try, &list_lock);
 		passed_sz = 0;
 
@@ -4407,15 +4484,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
 	ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
 	ARCSTAT_INCR(arcstat_l2_size, write_sz);
-	spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
+	vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
 
 	/*
 	 * Bump device hand to the device start if it is approaching the end.
 	 * l2arc_evict() will already have evicted ahead for this case.
 	 */
 	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
-		spa_l2cache_space_update(dev->l2ad_vdev, 0,
-		    dev->l2ad_end - dev->l2ad_hand);
+		vdev_space_update(dev->l2ad_vdev,
+		    dev->l2ad_end - dev->l2ad_hand, 0, 0);
 		dev->l2ad_hand = dev->l2ad_start;
 		dev->l2ad_evict = dev->l2ad_start;
 		dev->l2ad_first = B_FALSE;
@@ -4439,7 +4516,7 @@ l2arc_feed_thread(void)
 	l2arc_dev_t *dev;
 	spa_t *spa;
 	uint64_t size, wrote;
-	clock_t begin, next = lbolt;
+	clock_t begin, next = ddi_get_lbolt();
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
@@ -4447,10 +4524,10 @@ l2arc_feed_thread(void)
 
 	while (l2arc_thread_exit == 0) {
 		CALLB_CPR_SAFE_BEGIN(&cpr);
-		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
-		    next);
+		(void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
+		    &l2arc_feed_thr_lock, next);
 		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
-		next = lbolt + hz;
+		next = ddi_get_lbolt() + hz;
 
 		/*
 		 * Quick check for L2ARC devices.
@@ -4461,7 +4538,7 @@ l2arc_feed_thread(void)
 			continue;
 		}
 		mutex_exit(&l2arc_dev_mtx);
-		begin = lbolt;
+		begin = ddi_get_lbolt();
 
 		/*
 		 * This selects the next l2arc device to write to, and in
@@ -4480,6 +4557,16 @@ l2arc_feed_thread(void)
 		ASSERT(spa != NULL);
 
 		/*
+		 * If the pool is read-only then force the feed thread to
+		 * sleep a little longer.
+		 */
+		if (!spa_writeable(spa)) {
+			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
+			spa_config_exit(spa, SCL_L2ARC, dev);
+			continue;
+		}
+
+		/*
 		 * Avoid contributing to memory pressure.
 		 */
 		if (arc_reclaim_needed()) {
@@ -4556,6 +4643,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 	adddev->l2ad_evict = adddev->l2ad_start;
 	adddev->l2ad_first = B_TRUE;
 	adddev->l2ad_writing = B_FALSE;
+	list_link_init(&adddev->l2ad_node);
 	ASSERT3U(adddev->l2ad_write, >, 0);
 
 	/*
@@ -4566,7 +4654,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l2node));
 
-	spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
+	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
 
 	/*
 	 * Add device to global list
@@ -4681,3 +4769,55 @@ l2arc_stop(void)
 		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
 	mutex_exit(&l2arc_feed_thr_lock);
 }
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(arc_read);
+EXPORT_SYMBOL(arc_buf_remove_ref);
+EXPORT_SYMBOL(arc_getbuf_func);
+
+module_param(zfs_arc_min, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
+
+module_param(zfs_arc_max, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
+
+module_param(zfs_arc_meta_limit, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
+
+module_param(zfs_arc_reduce_dnlc_percent, int, 0444);
+MODULE_PARM_DESC(zfs_arc_reduce_dnlc_percent, "Meta reclaim percentage");
+
+module_param(zfs_arc_grow_retry, int, 0444);
+MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
+
+module_param(zfs_arc_shrink_shift, int, 0444);
+MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
+
+module_param(zfs_arc_p_min_shift, int, 0444);
+MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
+
+module_param(l2arc_write_max, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
+
+module_param(l2arc_write_boost, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
+
+module_param(l2arc_headroom, ulong, 0444);
+MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
+
+module_param(l2arc_feed_secs, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
+
+module_param(l2arc_feed_min_ms, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
+
+module_param(l2arc_noprefetch, int, 0444);
+MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
+
+module_param(l2arc_feed_again, int, 0444);
+MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
+
+module_param(l2arc_norw, int, 0444);
+MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
+
+#endif