Update core ZFS code from build 121 to build 141.
[zfs.git] / module / zfs / arc.c
index d5e5aa5..8adb54d 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
 
 #include <sys/spa.h>
 #include <sys/zio.h>
-#include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/refcount.h>
 #endif
 #include <sys/callb.h>
 #include <sys/kstat.h>
+#include <zfs_fletcher.h>
 
 static kmutex_t                arc_reclaim_thr_lock;
 static kcondvar_t      arc_reclaim_thr_cv;     /* used to signal reclaim thr */
@@ -178,7 +177,6 @@ static boolean_t arc_warm;
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
-int zfs_mdcomp_disable = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
@@ -249,6 +247,9 @@ typedef struct arc_stats {
        kstat_named_t arcstat_recycle_miss;
        kstat_named_t arcstat_mutex_miss;
        kstat_named_t arcstat_evict_skip;
+       kstat_named_t arcstat_evict_l2_cached;
+       kstat_named_t arcstat_evict_l2_eligible;
+       kstat_named_t arcstat_evict_l2_ineligible;
        kstat_named_t arcstat_hash_elements;
        kstat_named_t arcstat_hash_elements_max;
        kstat_named_t arcstat_hash_collisions;
@@ -302,6 +303,9 @@ static arc_stats_t arc_stats = {
        { "recycle_miss",               KSTAT_DATA_UINT64 },
        { "mutex_miss",                 KSTAT_DATA_UINT64 },
        { "evict_skip",                 KSTAT_DATA_UINT64 },
+       { "evict_l2_cached",            KSTAT_DATA_UINT64 },
+       { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
+       { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
        { "hash_elements",              KSTAT_DATA_UINT64 },
        { "hash_elements_max",          KSTAT_DATA_UINT64 },
        { "hash_collisions",            KSTAT_DATA_UINT64 },
@@ -341,7 +345,7 @@ static arc_stats_t arc_stats = {
 #define        ARCSTAT_INCR(stat, val) \
        atomic_add_64(&arc_stats.stat.value.ui64, (val));
 
-#define        ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
+#define        ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 #define        ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 
 #define        ARCSTAT_MAX(stat, val) {                                        \
@@ -375,7 +379,7 @@ static arc_stats_t arc_stats = {
        }
 
 kstat_t                        *arc_ksp;
-static arc_state_t     *arc_anon;
+static arc_state_t     *arc_anon;
 static arc_state_t     *arc_mru;
 static arc_state_t     *arc_mru_ghost;
 static arc_state_t     *arc_mfu;
@@ -432,6 +436,7 @@ struct arc_buf_hdr {
 
        kmutex_t                b_freeze_lock;
        zio_cksum_t             *b_freeze_cksum;
+       void                    *b_thawed;
 
        arc_buf_hdr_t           *b_hash_next;
        arc_buf_t               *b_buf;
@@ -468,6 +473,8 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 static int arc_evict_needed(arc_buf_contents_t type);
 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 
+static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
+
 #define        GHOST_STATE(state)      \
        ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
        (state) == arc_l2c_only)
@@ -490,7 +497,6 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 #define        ARC_L2_WRITING          (1 << 16)       /* L2ARC write in progress */
 #define        ARC_L2_EVICTED          (1 << 17)       /* evicted during I/O */
 #define        ARC_L2_WRITE_HEAD       (1 << 18)       /* head of write list */
-#define        ARC_STORED              (1 << 19)       /* has been store()d to */
 
 #define        HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 #define        HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -539,8 +545,8 @@ static buf_hash_table_t buf_hash_table;
        (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 #define        BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 #define        BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-#define        HDR_LOCK(buf) \
-       (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+#define        HDR_LOCK(hdr) \
+       (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 
 uint64_t zfs_crc64_table[256];
 
@@ -658,6 +664,15 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
        ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
        ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+       hdr->b_dva.dva_word[0] = 0;
+       hdr->b_dva.dva_word[1] = 0;
+       hdr->b_birth = 0;
+       hdr->b_cksum0 = 0;
+}
+
 static arc_buf_hdr_t *
 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 {
@@ -791,7 +806,8 @@ buf_cons(void *vbuf, void *unused, int kmflag)
        arc_buf_t *buf = vbuf;
 
        bzero(buf, sizeof (arc_buf_t));
-       rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
+       mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+       rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
        arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 
        return (0);
@@ -807,6 +823,7 @@ hdr_dest(void *vbuf, void *unused)
 {
        arc_buf_hdr_t *buf = vbuf;
 
+       ASSERT(BUF_EMPTY(buf));
        refcount_destroy(&buf->b_refcnt);
        cv_destroy(&buf->b_cv);
        mutex_destroy(&buf->b_freeze_lock);
@@ -819,7 +836,8 @@ buf_dest(void *vbuf, void *unused)
 {
        arc_buf_t *buf = vbuf;
 
-       rw_destroy(&buf->b_lock);
+       mutex_destroy(&buf->b_evict_lock);
+       rw_destroy(&buf->b_data_lock);
        arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
@@ -934,6 +952,11 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
+       kmutex_t *hash_lock;
+
+       hash_lock = HDR_LOCK(buf->b_hdr);
+       mutex_enter(hash_lock);
+
        if (zfs_flags & ZFS_DEBUG_MODIFY) {
                if (buf->b_hdr->b_state != arc_anon)
                        panic("modifying non-anon buffer!");
@@ -947,18 +970,32 @@ arc_buf_thaw(arc_buf_t *buf)
                kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
                buf->b_hdr->b_freeze_cksum = NULL;
        }
+
+       if (zfs_flags & ZFS_DEBUG_MODIFY) {
+               if (buf->b_hdr->b_thawed)
+                       kmem_free(buf->b_hdr->b_thawed, 1);
+               buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+       }
+
        mutex_exit(&buf->b_hdr->b_freeze_lock);
+       mutex_exit(hash_lock);
 }
 
 void
 arc_buf_freeze(arc_buf_t *buf)
 {
+       kmutex_t *hash_lock;
+
        if (!(zfs_flags & ZFS_DEBUG_MODIFY))
                return;
 
+       hash_lock = HDR_LOCK(buf->b_hdr);
+       mutex_enter(hash_lock);
+
        ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
            buf->b_hdr->b_state == arc_anon);
        arc_cksum_compute(buf, B_FALSE);
+       mutex_exit(hash_lock);
 }
 
 static void
@@ -1030,6 +1067,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
        ASSERT(new_state != old_state);
        ASSERT(refcnt == 0 || ab->b_datacnt > 0);
        ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+       ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
 
        from_delta = to_delta = ab->b_datacnt * ab->b_size;
 
@@ -1050,7 +1088,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 
                        /*
                         * If prefetching out of the ghost cache,
-                        * we will have a non-null datacnt.
+                        * we will have a non-zero datacnt.
                         */
                        if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
                                /* ghost elements have a ghost size */
@@ -1086,9 +1124,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
        }
 
        ASSERT(!BUF_EMPTY(ab));
-       if (new_state == arc_anon) {
+       if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
                buf_hash_remove(ab);
-       }
 
        /* adjust state sizes */
        if (to_delta)
@@ -1232,14 +1269,29 @@ arc_return_buf(arc_buf_t *buf, void *tag)
 {
        arc_buf_hdr_t *hdr = buf->b_hdr;
 
-       ASSERT(hdr->b_state == arc_anon);
        ASSERT(buf->b_data != NULL);
-       VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
-       VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
+       (void) refcount_add(&hdr->b_refcnt, tag);
+       (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
 
        atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
 }
 
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+       arc_buf_hdr_t *hdr;
+
+       ASSERT(buf->b_data != NULL);
+       hdr = buf->b_hdr;
+       (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
+       (void) refcount_remove(&hdr->b_refcnt, tag);
+       buf->b_efunc = NULL;
+       buf->b_private = NULL;
+
+       atomic_add_64(&arc_loaned_bytes, hdr->b_size);
+}
+
 static arc_buf_t *
 arc_buf_clone(arc_buf_t *from)
 {
@@ -1247,6 +1299,8 @@ arc_buf_clone(arc_buf_t *from)
        arc_buf_hdr_t *hdr = from->b_hdr;
        uint64_t size = hdr->b_size;
 
+       ASSERT(hdr->b_state != arc_anon);
+
        buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
        buf->b_hdr = hdr;
        buf->b_data = NULL;
@@ -1271,16 +1325,16 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
         * must verify b_data != NULL to know if the add_ref
         * was successful.
         */
-       rw_enter(&buf->b_lock, RW_READER);
+       mutex_enter(&buf->b_evict_lock);
        if (buf->b_data == NULL) {
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                return;
        }
-       hdr = buf->b_hdr;
-       ASSERT(hdr != NULL);
-       hash_lock = HDR_LOCK(hdr);
+       hash_lock = HDR_LOCK(buf->b_hdr);
        mutex_enter(hash_lock);
-       rw_exit(&buf->b_lock);
+       hdr = buf->b_hdr;
+       ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+       mutex_exit(&buf->b_evict_lock);
 
        ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
        add_reference(hdr, hash_lock, tag);
@@ -1328,6 +1382,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
                arc_buf_contents_t type = buf->b_hdr->b_type;
 
                arc_cksum_verify(buf);
+
                if (!recycle) {
                        if (type == ARC_BUFC_METADATA) {
                                arc_buf_data_free(buf->b_hdr, zio_buf_free,
@@ -1365,6 +1420,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
        for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
                continue;
        *bufp = buf->b_next;
+       buf->b_next = NULL;
 
        ASSERT(buf->b_efunc == NULL);
 
@@ -1379,55 +1435,55 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
        ASSERT(refcount_is_zero(&hdr->b_refcnt));
        ASSERT3P(hdr->b_state, ==, arc_anon);
        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-       ASSERT(!(hdr->b_flags & ARC_STORED));
+       l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
 
-       if (hdr->b_l2hdr != NULL) {
-               if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
-                       /*
-                        * To prevent arc_free() and l2arc_evict() from
-                        * attempting to free the same buffer at the same time,
-                        * a FREE_IN_PROGRESS flag is given to arc_free() to
-                        * give it priority.  l2arc_evict() can't destroy this
-                        * header while we are waiting on l2arc_buflist_mtx.
-                        *
-                        * The hdr may be removed from l2ad_buflist before we
-                        * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
-                        */
+       if (l2hdr != NULL) {
+               boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
+               /*
+                * To prevent arc_free() and l2arc_evict() from
+                * attempting to free the same buffer at the same time,
+                * a FREE_IN_PROGRESS flag is given to arc_free() to
+                * give it priority.  l2arc_evict() can't destroy this
+                * header while we are waiting on l2arc_buflist_mtx.
+                *
+                * The hdr may be removed from l2ad_buflist before we
+                * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+                */
+               if (!buflist_held) {
                        mutex_enter(&l2arc_buflist_mtx);
-                       if (hdr->b_l2hdr != NULL) {
-                               list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
-                                   hdr);
-                       }
-                       mutex_exit(&l2arc_buflist_mtx);
-               } else {
-                       list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+                       l2hdr = hdr->b_l2hdr;
                }
-               ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
-               kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
-               if (hdr->b_state == arc_l2c_only)
-                       l2arc_hdr_stat_remove();
-               hdr->b_l2hdr = NULL;
+
+               if (l2hdr != NULL) {
+                       list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+                       ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+                       kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+                       if (hdr->b_state == arc_l2c_only)
+                               l2arc_hdr_stat_remove();
+                       hdr->b_l2hdr = NULL;
+               }
+
+               if (!buflist_held)
+                       mutex_exit(&l2arc_buflist_mtx);
        }
 
        if (!BUF_EMPTY(hdr)) {
                ASSERT(!HDR_IN_HASH_TABLE(hdr));
-               bzero(&hdr->b_dva, sizeof (dva_t));
-               hdr->b_birth = 0;
-               hdr->b_cksum0 = 0;
+               buf_discard_identity(hdr);
        }
        while (hdr->b_buf) {
                arc_buf_t *buf = hdr->b_buf;
 
                if (buf->b_efunc) {
                        mutex_enter(&arc_eviction_mtx);
-                       rw_enter(&buf->b_lock, RW_WRITER);
+                       mutex_enter(&buf->b_evict_lock);
                        ASSERT(buf->b_hdr != NULL);
                        arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
                        hdr->b_buf = buf->b_next;
                        buf->b_hdr = &arc_eviction_hdr;
                        buf->b_next = arc_eviction_list;
                        arc_eviction_list = buf;
-                       rw_exit(&buf->b_lock);
+                       mutex_exit(&buf->b_evict_lock);
                        mutex_exit(&arc_eviction_mtx);
                } else {
                        arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
@@ -1437,6 +1493,10 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
                kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
                hdr->b_freeze_cksum = NULL;
        }
+       if (hdr->b_thawed) {
+               kmem_free(hdr->b_thawed, 1);
+               hdr->b_thawed = NULL;
+       }
 
        ASSERT(!list_link_active(&hdr->b_arc_node));
        ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1457,11 +1517,17 @@ arc_buf_free(arc_buf_t *buf, void *tag)
                kmutex_t *hash_lock = HDR_LOCK(hdr);
 
                mutex_enter(hash_lock);
+               hdr = buf->b_hdr;
+               ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
                (void) remove_reference(hdr, hash_lock, tag);
-               if (hdr->b_datacnt > 1)
+               if (hdr->b_datacnt > 1) {
                        arc_buf_destroy(buf, FALSE, TRUE);
-               else
+               } else {
+                       ASSERT(buf == hdr->b_buf);
+                       ASSERT(buf->b_efunc == NULL);
                        hdr->b_flags |= ARC_BUF_AVAILABLE;
+               }
                mutex_exit(hash_lock);
        } else if (HDR_IO_IN_PROGRESS(hdr)) {
                int destroy_hdr;
@@ -1478,12 +1544,10 @@ arc_buf_free(arc_buf_t *buf, void *tag)
                if (destroy_hdr)
                        arc_hdr_destroy(hdr);
        } else {
-               if (remove_reference(hdr, NULL, tag) > 0) {
-                       ASSERT(HDR_IO_ERROR(hdr));
+               if (remove_reference(hdr, NULL, tag) > 0)
                        arc_buf_destroy(buf, FALSE, TRUE);
-               } else {
+               else
                        arc_hdr_destroy(hdr);
-               }
        }
 }
 
@@ -1495,11 +1559,14 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
        int no_callback = (buf->b_efunc == NULL);
 
        if (hdr->b_state == arc_anon) {
+               ASSERT(hdr->b_datacnt == 1);
                arc_buf_free(buf, tag);
                return (no_callback);
        }
 
        mutex_enter(hash_lock);
+       hdr = buf->b_hdr;
+       ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
        ASSERT(hdr->b_state != arc_anon);
        ASSERT(buf->b_data != NULL);
 
@@ -1509,6 +1576,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
                        arc_buf_destroy(buf, FALSE, TRUE);
        } else if (no_callback) {
                ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+               ASSERT(buf->b_efunc == NULL);
                hdr->b_flags |= ARC_BUF_AVAILABLE;
        }
        ASSERT(no_callback || hdr->b_datacnt > 1 ||
@@ -1561,7 +1629,8 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                if (HDR_IO_IN_PROGRESS(ab) ||
                    (spa && ab->b_spa != spa) ||
                    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
-                   lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
+                   ddi_get_lbolt() - ab->b_arc_access <
+                   arc_min_prefetch_lifespan)) {
                        skipped++;
                        continue;
                }
@@ -1576,7 +1645,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                        ASSERT(ab->b_datacnt > 0);
                        while (ab->b_buf) {
                                arc_buf_t *buf = ab->b_buf;
-                               if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
+                               if (!mutex_tryenter(&buf->b_evict_lock)) {
                                        missed += 1;
                                        break;
                                }
@@ -1598,13 +1667,28 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                                        buf->b_next = arc_eviction_list;
                                        arc_eviction_list = buf;
                                        mutex_exit(&arc_eviction_mtx);
-                                       rw_exit(&buf->b_lock);
+                                       mutex_exit(&buf->b_evict_lock);
                                } else {
-                                       rw_exit(&buf->b_lock);
+                                       mutex_exit(&buf->b_evict_lock);
                                        arc_buf_destroy(buf,
                                            buf->b_data == stolen, TRUE);
                                }
                        }
+
+                       if (ab->b_l2hdr) {
+                               ARCSTAT_INCR(arcstat_evict_l2_cached,
+                                   ab->b_size);
+                       } else {
+                               if (l2arc_write_eligible(ab->b_spa, ab)) {
+                                       ARCSTAT_INCR(arcstat_evict_l2_eligible,
+                                           ab->b_size);
+                               } else {
+                                       ARCSTAT_INCR(
+                                           arcstat_evict_l2_ineligible,
+                                           ab->b_size);
+                               }
+                       }
+
                        if (ab->b_datacnt == 0) {
                                arc_change_state(evicted_state, ab, hash_lock);
                                ASSERT(HDR_IN_HASH_TABLE(ab));
@@ -1679,6 +1763,9 @@ top:
                if (spa && ab->b_spa != spa)
                        continue;
                hash_lock = HDR_LOCK(ab);
+               /* caller may be trying to modify this buffer, skip it */
+               if (MUTEX_HELD(hash_lock))
+                       continue;
                if (mutex_tryenter(hash_lock)) {
                        ASSERT(!HDR_IO_IN_PROGRESS(ab));
                        ASSERT(ab->b_buf == NULL);
@@ -1799,9 +1886,9 @@ arc_do_user_evicts(void)
        while (arc_eviction_list != NULL) {
                arc_buf_t *buf = arc_eviction_list;
                arc_eviction_list = buf->b_next;
-               rw_enter(&buf->b_lock, RW_WRITER);
+               mutex_enter(&buf->b_evict_lock);
                buf->b_hdr = NULL;
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                mutex_exit(&arc_eviction_mtx);
 
                if (buf->b_efunc != NULL)
@@ -2017,12 +2104,12 @@ arc_reclaim_thread(void)
                        }
 
                        /* reset the growth delay for every reclaim */
-                       growtime = lbolt + (arc_grow_retry * hz);
+                       growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
 
                        arc_kmem_reap_now(last_reclaim);
                        arc_warm = B_TRUE;
 
-               } else if (arc_no_grow && lbolt >= growtime) {
+               } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
                        arc_no_grow = FALSE;
                }
 
@@ -2036,7 +2123,7 @@ arc_reclaim_thread(void)
                /* block until needed, or one second, whichever is shorter */
                CALLB_CPR_SAFE_BEGIN(&cpr);
                (void) cv_timedwait(&arc_reclaim_thr_cv,
-                   &arc_reclaim_thr_lock, (lbolt + hz));
+                   &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
                CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
        }
 
@@ -2251,6 +2338,8 @@ out:
 static void
 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 {
+       clock_t now;
+
        ASSERT(MUTEX_HELD(hash_lock));
 
        if (buf->b_state == arc_anon) {
@@ -2261,11 +2350,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                 */
 
                ASSERT(buf->b_arc_access == 0);
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
                DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
                arc_change_state(arc_mru, buf, hash_lock);
 
        } else if (buf->b_state == arc_mru) {
+               now = ddi_get_lbolt();
+
                /*
                 * If this buffer is here because of a prefetch, then either:
                 * - clear the flag if this is a "referencing" read
@@ -2281,7 +2372,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                                buf->b_flags &= ~ARC_PREFETCH;
                                ARCSTAT_BUMP(arcstat_mru_hits);
                        }
-                       buf->b_arc_access = lbolt;
+                       buf->b_arc_access = now;
                        return;
                }
 
@@ -2290,13 +2381,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                 * but it is still in the cache. Move it to the MFU
                 * state.
                 */
-               if (lbolt > buf->b_arc_access + ARC_MINTIME) {
+               if (now > buf->b_arc_access + ARC_MINTIME) {
                        /*
                         * More than 125ms have passed since we
                         * instantiated this buffer.  Move it to the
                         * most frequently used state.
                         */
-                       buf->b_arc_access = lbolt;
+                       buf->b_arc_access = now;
                        DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
                        arc_change_state(arc_mfu, buf, hash_lock);
                }
@@ -2319,7 +2410,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                        DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
                }
 
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
                arc_change_state(new_state, buf, hash_lock);
 
                ARCSTAT_BUMP(arcstat_mru_ghost_hits);
@@ -2338,7 +2429,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                        ASSERT(list_link_active(&buf->b_arc_node));
                }
                ARCSTAT_BUMP(arcstat_mfu_hits);
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
        } else if (buf->b_state == arc_mfu_ghost) {
                arc_state_t     *new_state = arc_mfu;
                /*
@@ -2356,7 +2447,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                        new_state = arc_mru;
                }
 
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
                DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
                arc_change_state(new_state, buf, hash_lock);
 
@@ -2366,7 +2457,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                 * This buffer is on the 2nd Level ARC.
                 */
 
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
                DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
                arc_change_state(arc_mfu, buf, hash_lock);
        } else {
@@ -2379,7 +2470,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 void
 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
-       bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+       if (zio == NULL || zio->io_error == 0)
+               bcopy(buf->b_data, arg, buf->b_hdr->b_size);
        VERIFY(arc_buf_remove_ref(buf, arg) == 1);
 }
 
@@ -2393,6 +2485,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
                *bufp = NULL;
        } else {
                *bufp = buf;
+               ASSERT(buf->b_data);
        }
 }
 
@@ -2431,7 +2524,7 @@ arc_read_done(zio_t *zio)
        /* byteswap if necessary */
        callback_list = hdr->b_acb;
        ASSERT(callback_list != NULL);
-       if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+       if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
                arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
                    byteswap_uint64_array :
                    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
@@ -2440,6 +2533,16 @@ arc_read_done(zio_t *zio)
 
        arc_cksum_compute(buf, B_FALSE);
 
+       if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+               /*
+                * Only call arc_access on anonymous buffers.  This is because
+                * if we've issued an I/O for an evicted buffer, we've already
+                * called arc_access (to prevent any simultaneous readers from
+                * getting confused).
+                */
+               arc_access(hdr, hash_lock);
+       }
+
        /* create copies of the data buffer for the callers */
        abuf = buf;
        for (acb = callback_list; acb; acb = acb->acb_next) {
@@ -2453,8 +2556,11 @@ arc_read_done(zio_t *zio)
        hdr->b_acb = NULL;
        hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
        ASSERT(!HDR_BUF_AVAILABLE(hdr));
-       if (abuf == buf)
+       if (abuf == buf) {
+               ASSERT(buf->b_efunc == NULL);
+               ASSERT(hdr->b_datacnt == 1);
                hdr->b_flags |= ARC_BUF_AVAILABLE;
+       }
 
        ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
 
@@ -2475,14 +2581,6 @@ arc_read_done(zio_t *zio)
        cv_broadcast(&hdr->b_cv);
 
        if (hash_lock) {
-               /*
-                * Only call arc_access on anonymous buffers.  This is because
-                * if we've issued an I/O for an evicted buffer, we've already
-                * called arc_access (to prevent any simultaneous readers from
-                * getting confused).
-                */
-               if (zio->io_error == 0 && hdr->b_state == arc_anon)
-                       arc_access(hdr, hash_lock);
                mutex_exit(hash_lock);
        } else {
                /*
@@ -2536,25 +2634,34 @@ arc_read_done(zio_t *zio)
  * arc_read_bp.
  */
 int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
     arc_done_func_t *done, void *private, int priority, int zio_flags,
     uint32_t *arc_flags, const zbookmark_t *zb)
 {
        int err;
 
+       if (pbuf == NULL) {
+               /*
+                * XXX This happens from traverse callback funcs, for
+                * the objset_phys_t block.
+                */
+               return (arc_read_nolock(pio, spa, bp, done, private, priority,
+                   zio_flags, arc_flags, zb));
+       }
+
        ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
        ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
-       rw_enter(&pbuf->b_lock, RW_READER);
+       rw_enter(&pbuf->b_data_lock, RW_READER);
 
        err = arc_read_nolock(pio, spa, bp, done, private, priority,
            zio_flags, arc_flags, zb);
-       rw_exit(&pbuf->b_lock);
+       rw_exit(&pbuf->b_data_lock);
 
        return (err);
 }
 
 int
-arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_done_func_t *done, void *private, int priority, int zio_flags,
     uint32_t *arc_flags, const zbookmark_t *zb)
 {
@@ -2565,7 +2672,8 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
        uint64_t guid = spa_guid(spa);
 
 top:
-       hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+       hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+           &hash_lock);
        if (hdr && hdr->b_datacnt > 0) {
 
                *arc_flags |= ARC_CACHED;
@@ -2619,6 +2727,7 @@ top:
                        } else {
                                buf = arc_buf_clone(buf);
                        }
+
                } else if (*arc_flags & ARC_PREFETCH &&
                    refcount_count(&hdr->b_refcnt) == 0) {
                        hdr->b_flags |= ARC_PREFETCH;
@@ -2649,15 +2758,13 @@ top:
                        buf = arc_buf_alloc(spa, size, private, type);
                        hdr = buf->b_hdr;
                        hdr->b_dva = *BP_IDENTITY(bp);
-                       hdr->b_birth = bp->blk_birth;
+                       hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
                        hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
                        exists = buf_hash_insert(hdr, &hash_lock);
                        if (exists) {
                                /* somebody beat us to the hash insert */
                                mutex_exit(hash_lock);
-                               bzero(&hdr->b_dva, sizeof (dva_t));
-                               hdr->b_birth = 0;
-                               hdr->b_cksum0 = 0;
+                               buf_discard_identity(hdr);
                                (void) arc_buf_remove_ref(buf, private);
                                goto top; /* restart the IO request */
                        }
@@ -2692,12 +2799,14 @@ top:
                        buf->b_private = NULL;
                        buf->b_next = NULL;
                        hdr->b_buf = buf;
-                       arc_get_data_buf(buf);
                        ASSERT(hdr->b_datacnt == 0);
                        hdr->b_datacnt = 1;
-
+                       arc_get_data_buf(buf);
+                       arc_access(hdr, hash_lock);
                }
 
+               ASSERT(!GHOST_STATE(hdr->b_state));
+
                acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
                acb->acb_done = done;
                acb->acb_private = private;
@@ -2706,17 +2815,6 @@ top:
                hdr->b_acb = acb;
                hdr->b_flags |= ARC_IO_IN_PROGRESS;
 
-               /*
-                * If the buffer has been evicted, migrate it to a present state
-                * before issuing the I/O.  Once we drop the hash-table lock,
-                * the header will be marked as I/O in progress and have an
-                * attached buffer.  At this point, anybody who finds this
-                * buffer ought to notice that it's legit but has a pending I/O.
-                */
-
-               if (GHOST_STATE(hdr->b_state))
-                       arc_access(hdr, hash_lock);
-
                if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
                    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
                        devw = hdr->b_l2hdr->b_dev->l2ad_writing;
@@ -2732,8 +2830,8 @@ top:
                mutex_exit(hash_lock);
 
                ASSERT3U(hdr->b_size, ==, size);
-               DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
-                   zbookmark_t *, zb);
+               DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
+                   uint64_t, size, zbookmark_t *, zb);
                ARCSTAT_BUMP(arcstat_misses);
                ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
                    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
@@ -2819,47 +2917,15 @@ top:
        return (0);
 }
 
-/*
- * arc_read() variant to support pool traversal.  If the block is already
- * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
- * The idea is that we don't want pool traversal filling up memory, but
- * if the ARC already has the data anyway, we shouldn't pay for the I/O.
- */
-int
-arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
-{
-       arc_buf_hdr_t *hdr;
-       kmutex_t *hash_mtx;
-       uint64_t guid = spa_guid(spa);
-       int rc = 0;
-
-       hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
-
-       if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
-               arc_buf_t *buf = hdr->b_buf;
-
-               ASSERT(buf);
-               while (buf->b_data == NULL) {
-                       buf = buf->b_next;
-                       ASSERT(buf);
-               }
-               bcopy(buf->b_data, data, hdr->b_size);
-       } else {
-               rc = ENOENT;
-       }
-
-       if (hash_mtx)
-               mutex_exit(hash_mtx);
-
-       return (rc);
-}
-
 void
 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
 {
        ASSERT(buf->b_hdr != NULL);
        ASSERT(buf->b_hdr->b_state != arc_anon);
        ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+       ASSERT(buf->b_efunc == NULL);
+       ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
+
        buf->b_efunc = func;
        buf->b_private = private;
 }
@@ -2876,14 +2942,14 @@ arc_buf_evict(arc_buf_t *buf)
        kmutex_t *hash_lock;
        arc_buf_t **bufp;
 
-       rw_enter(&buf->b_lock, RW_WRITER);
+       mutex_enter(&buf->b_evict_lock);
        hdr = buf->b_hdr;
        if (hdr == NULL) {
                /*
                 * We are in arc_do_user_evicts().
                 */
                ASSERT(buf->b_data == NULL);
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                return (0);
        } else if (buf->b_data == NULL) {
                arc_buf_t copy = *buf; /* structure assignment */
@@ -2892,14 +2958,15 @@ arc_buf_evict(arc_buf_t *buf)
                 * but let arc_do_user_evicts() do the reaping.
                 */
                buf->b_efunc = NULL;
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                VERIFY(copy.b_efunc(&copy) == 0);
                return (1);
        }
        hash_lock = HDR_LOCK(hdr);
        mutex_enter(hash_lock);
+       hdr = buf->b_hdr;
+       ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
-       ASSERT(buf->b_hdr == hdr);
        ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
        ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
 
@@ -2918,6 +2985,7 @@ arc_buf_evict(arc_buf_t *buf)
                arc_state_t *old_state = hdr->b_state;
                arc_state_t *evicted_state;
 
+               ASSERT(hdr->b_buf == NULL);
                ASSERT(refcount_is_zero(&hdr->b_refcnt));
 
                evicted_state =
@@ -2935,12 +3003,13 @@ arc_buf_evict(arc_buf_t *buf)
                mutex_exit(&old_state->arcs_mtx);
        }
        mutex_exit(hash_lock);
-       rw_exit(&buf->b_lock);
+       mutex_exit(&buf->b_evict_lock);
 
        VERIFY(buf->b_efunc(buf) == 0);
        buf->b_efunc = NULL;
        buf->b_private = NULL;
        buf->b_hdr = NULL;
+       buf->b_next = NULL;
        kmem_cache_free(buf_cache, buf);
        return (1);
 }
@@ -2955,29 +3024,30 @@ void
 arc_release(arc_buf_t *buf, void *tag)
 {
        arc_buf_hdr_t *hdr;
-       kmutex_t *hash_lock;
+       kmutex_t *hash_lock = NULL;
        l2arc_buf_hdr_t *l2hdr;
        uint64_t buf_size;
-       boolean_t released = B_FALSE;
 
-       rw_enter(&buf->b_lock, RW_WRITER);
+       /*
+        * It would be nice to assert that if it's DMU metadata (level >
+        * 0 || it's the dnode file), then it must be syncing context.
+        * But we don't know that information at this level.
+        */
+
+       mutex_enter(&buf->b_evict_lock);
        hdr = buf->b_hdr;
 
        /* this buffer is not on any list */
        ASSERT(refcount_count(&hdr->b_refcnt) > 0);
-       ASSERT(!(hdr->b_flags & ARC_STORED));
 
        if (hdr->b_state == arc_anon) {
                /* this buffer is already released */
-               ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
-               ASSERT(BUF_EMPTY(hdr));
                ASSERT(buf->b_efunc == NULL);
-               arc_buf_thaw(buf);
-               rw_exit(&buf->b_lock);
-               released = B_TRUE;
        } else {
                hash_lock = HDR_LOCK(hdr);
                mutex_enter(hash_lock);
+               hdr = buf->b_hdr;
+               ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
        }
 
        l2hdr = hdr->b_l2hdr;
@@ -2987,9 +3057,6 @@ arc_release(arc_buf_t *buf, void *tag)
                buf_size = hdr->b_size;
        }
 
-       if (released)
-               goto out;
-
        /*
         * Do we have more than one buf?
         */
@@ -3003,14 +3070,14 @@ arc_release(arc_buf_t *buf, void *tag)
 
                ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
                /*
-                * Pull the data off of this buf and attach it to
-                * a new anonymous buf.
+                * Pull the data off of this hdr and attach it to
+                * a new anonymous hdr.
                 */
                (void) remove_reference(hdr, hash_lock, tag);
                bufp = &hdr->b_buf;
                while (*bufp != buf)
                        bufp = &(*bufp)->b_next;
-               *bufp = (*bufp)->b_next;
+               *bufp = buf->b_next;
                buf->b_next = NULL;
 
                ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
@@ -3038,26 +3105,25 @@ arc_release(arc_buf_t *buf, void *tag)
                nhdr->b_freeze_cksum = NULL;
                (void) refcount_add(&nhdr->b_refcnt, tag);
                buf->b_hdr = nhdr;
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                atomic_add_64(&arc_anon->arcs_size, blksz);
        } else {
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                ASSERT(refcount_count(&hdr->b_refcnt) == 1);
                ASSERT(!list_link_active(&hdr->b_arc_node));
                ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-               arc_change_state(arc_anon, hdr, hash_lock);
+               if (hdr->b_state != arc_anon)
+                       arc_change_state(arc_anon, hdr, hash_lock);
                hdr->b_arc_access = 0;
-               mutex_exit(hash_lock);
+               if (hash_lock)
+                       mutex_exit(hash_lock);
 
-               bzero(&hdr->b_dva, sizeof (dva_t));
-               hdr->b_birth = 0;
-               hdr->b_cksum0 = 0;
+               buf_discard_identity(hdr);
                arc_buf_thaw(buf);
        }
        buf->b_efunc = NULL;
        buf->b_private = NULL;
 
-out:
        if (l2hdr) {
                list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
                kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
@@ -3066,14 +3132,27 @@ out:
        }
 }
 
+/*
+ * Release this buffer.  If it does not match the provided BP, fill it
+ * with that block's contents.
+ */
+/* ARGSUSED */
+int
+arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+    zbookmark_t *zb)
+{
+       arc_release(buf, tag);
+       return (0);
+}
+
 int
 arc_released(arc_buf_t *buf)
 {
        int released;
 
-       rw_enter(&buf->b_lock, RW_READER);
+       mutex_enter(&buf->b_evict_lock);
        released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
-       rw_exit(&buf->b_lock);
+       mutex_exit(&buf->b_evict_lock);
        return (released);
 }
 
@@ -3082,9 +3161,9 @@ arc_has_callback(arc_buf_t *buf)
 {
        int callback;
 
-       rw_enter(&buf->b_lock, RW_READER);
+       mutex_enter(&buf->b_evict_lock);
        callback = (buf->b_efunc != NULL);
-       rw_exit(&buf->b_lock);
+       mutex_exit(&buf->b_evict_lock);
        return (callback);
 }
 
@@ -3094,9 +3173,9 @@ arc_referenced(arc_buf_t *buf)
 {
        int referenced;
 
-       rw_enter(&buf->b_lock, RW_READER);
+       mutex_enter(&buf->b_evict_lock);
        referenced = (refcount_count(&buf->b_hdr->b_refcnt));
-       rw_exit(&buf->b_lock);
+       mutex_exit(&buf->b_evict_lock);
        return (referenced);
 }
 #endif
@@ -3136,21 +3215,28 @@ arc_write_done(zio_t *zio)
        arc_buf_t *buf = callback->awcb_buf;
        arc_buf_hdr_t *hdr = buf->b_hdr;
 
-       hdr->b_acb = NULL;
+       ASSERT(hdr->b_acb == NULL);
+
+       if (zio->io_error == 0) {
+               hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+               hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+               hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+       } else {
+               ASSERT(BUF_EMPTY(hdr));
+       }
 
-       hdr->b_dva = *BP_IDENTITY(zio->io_bp);
-       hdr->b_birth = zio->io_bp->blk_birth;
-       hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
        /*
         * If the block to be written was all-zero, we may have
         * compressed it away.  In this case no write was performed
-        * so there will be no dva/birth-date/checksum.  The buffer
-        * must therefor remain anonymous (and uncached).
+        * so there will be no dva/birth/checksum.  The buffer must
+        * therefore remain anonymous (and uncached).
         */
        if (!BUF_EMPTY(hdr)) {
                arc_buf_hdr_t *exists;
                kmutex_t *hash_lock;
 
+               ASSERT(zio->io_error == 0);
+
                arc_cksum_verify(buf);
 
                exists = buf_hash_insert(hdr, &hash_lock);
@@ -3160,106 +3246,54 @@ arc_write_done(zio_t *zio)
                         * sync-to-convergence, because we remove
                         * buffers from the hash table when we arc_free().
                         */
-                       ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
-                       ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
-                           BP_IDENTITY(zio->io_bp)));
-                       ASSERT3U(zio->io_bp_orig.blk_birth, ==,
-                           zio->io_bp->blk_birth);
-
-                       ASSERT(refcount_is_zero(&exists->b_refcnt));
-                       arc_change_state(arc_anon, exists, hash_lock);
-                       mutex_exit(hash_lock);
-                       arc_hdr_destroy(exists);
-                       exists = buf_hash_insert(hdr, &hash_lock);
-                       ASSERT3P(exists, ==, NULL);
+                       if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+                               if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+                                       panic("bad overwrite, hdr=%p exists=%p",
+                                           (void *)hdr, (void *)exists);
+                               ASSERT(refcount_is_zero(&exists->b_refcnt));
+                               arc_change_state(arc_anon, exists, hash_lock);
+                               mutex_exit(hash_lock);
+                               arc_hdr_destroy(exists);
+                               exists = buf_hash_insert(hdr, &hash_lock);
+                               ASSERT3P(exists, ==, NULL);
+                       } else {
+                               /* Dedup */
+                               ASSERT(hdr->b_datacnt == 1);
+                               ASSERT(hdr->b_state == arc_anon);
+                               ASSERT(BP_GET_DEDUP(zio->io_bp));
+                               ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+                       }
                }
                hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
                /* if it's not anon, we are doing a scrub */
-               if (hdr->b_state == arc_anon)
+               if (!exists && hdr->b_state == arc_anon)
                        arc_access(hdr, hash_lock);
                mutex_exit(hash_lock);
-       } else if (callback->awcb_done == NULL) {
-               int destroy_hdr;
-               /*
-                * This is an anonymous buffer with no user callback,
-                * destroy it if there are no active references.
-                */
-               mutex_enter(&arc_eviction_mtx);
-               destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
-               hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
-               mutex_exit(&arc_eviction_mtx);
-               if (destroy_hdr)
-                       arc_hdr_destroy(hdr);
        } else {
                hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
        }
-       hdr->b_flags &= ~ARC_STORED;
 
-       if (callback->awcb_done) {
-               ASSERT(!refcount_is_zero(&hdr->b_refcnt));
-               callback->awcb_done(zio, buf, callback->awcb_private);
-       }
+       ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+       callback->awcb_done(zio, buf, callback->awcb_private);
 
        kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
-void
-write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
-{
-       boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
-
-       /* Determine checksum setting */
-       if (ismd) {
-               /*
-                * Metadata always gets checksummed.  If the data
-                * checksum is multi-bit correctable, and it's not a
-                * ZBT-style checksum, then it's suitable for metadata
-                * as well.  Otherwise, the metadata checksum defaults
-                * to fletcher4.
-                */
-               if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
-                   !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
-                       zp->zp_checksum = wp->wp_oschecksum;
-               else
-                       zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
-       } else {
-               zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
-                   wp->wp_oschecksum);
-       }
-
-       /* Determine compression setting */
-       if (ismd) {
-               /*
-                * XXX -- we should design a compression algorithm
-                * that specializes in arrays of bps.
-                */
-               zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
-                   ZIO_COMPRESS_LZJB;
-       } else {
-               zp->zp_compress = zio_compress_select(wp->wp_dncompress,
-                   wp->wp_oscompress);
-       }
-
-       zp->zp_type = wp->wp_type;
-       zp->zp_level = wp->wp_level;
-       zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
-}
-
 zio_t *
-arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
-    boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
-    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
-    int zio_flags, const zbookmark_t *zb)
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+    arc_done_func_t *ready, arc_done_func_t *done, void *private,
+    int priority, int zio_flags, const zbookmark_t *zb)
 {
        arc_buf_hdr_t *hdr = buf->b_hdr;
        arc_write_callback_t *callback;
        zio_t *zio;
-       zio_prop_t zp;
 
        ASSERT(ready != NULL);
+       ASSERT(done != NULL);
        ASSERT(!HDR_IO_ERROR(hdr));
        ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
-       ASSERT(hdr->b_acb == 0);
+       ASSERT(hdr->b_acb == NULL);
        if (l2arc)
                hdr->b_flags |= ARC_L2CACHE;
        callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
@@ -3268,88 +3302,12 @@ arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
        callback->awcb_private = private;
        callback->awcb_buf = buf;
 
-       write_policy(spa, wp, &zp);
-       zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
+       zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
            arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
 
        return (zio);
 }
 
-int
-arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, uint32_t arc_flags)
-{
-       arc_buf_hdr_t *ab;
-       kmutex_t *hash_lock;
-       zio_t   *zio;
-       uint64_t guid = spa_guid(spa);
-
-       /*
-        * If this buffer is in the cache, release it, so it
-        * can be re-used.
-        */
-       ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
-       if (ab != NULL) {
-               /*
-                * The checksum of blocks to free is not always
-                * preserved (eg. on the deadlist).  However, if it is
-                * nonzero, it should match what we have in the cache.
-                */
-               ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
-                   bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
-                   bp->blk_fill == BLK_FILL_ALREADY_FREED);
-
-               if (ab->b_state != arc_anon)
-                       arc_change_state(arc_anon, ab, hash_lock);
-               if (HDR_IO_IN_PROGRESS(ab)) {
-                       /*
-                        * This should only happen when we prefetch.
-                        */
-                       ASSERT(ab->b_flags & ARC_PREFETCH);
-                       ASSERT3U(ab->b_datacnt, ==, 1);
-                       ab->b_flags |= ARC_FREED_IN_READ;
-                       if (HDR_IN_HASH_TABLE(ab))
-                               buf_hash_remove(ab);
-                       ab->b_arc_access = 0;
-                       bzero(&ab->b_dva, sizeof (dva_t));
-                       ab->b_birth = 0;
-                       ab->b_cksum0 = 0;
-                       ab->b_buf->b_efunc = NULL;
-                       ab->b_buf->b_private = NULL;
-                       mutex_exit(hash_lock);
-               } else if (refcount_is_zero(&ab->b_refcnt)) {
-                       ab->b_flags |= ARC_FREE_IN_PROGRESS;
-                       mutex_exit(hash_lock);
-                       arc_hdr_destroy(ab);
-                       ARCSTAT_BUMP(arcstat_deleted);
-               } else {
-                       /*
-                        * We still have an active reference on this
-                        * buffer.  This can happen, e.g., from
-                        * dbuf_unoverride().
-                        */
-                       ASSERT(!HDR_IN_HASH_TABLE(ab));
-                       ab->b_arc_access = 0;
-                       bzero(&ab->b_dva, sizeof (dva_t));
-                       ab->b_birth = 0;
-                       ab->b_cksum0 = 0;
-                       ab->b_buf->b_efunc = NULL;
-                       ab->b_buf->b_private = NULL;
-                       mutex_exit(hash_lock);
-               }
-       }
-
-       zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
-
-       if (arc_flags & ARC_WAIT)
-               return (zio_wait(zio));
-
-       ASSERT(arc_flags & ARC_NOWAIT);
-       zio_nowait(zio);
-
-       return (0);
-}
-
 static int
 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
 {
@@ -3786,12 +3744,11 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
        /*
         * A buffer is *not* eligible for the L2ARC if it:
         * 1. belongs to a different spa.
-        * 2. has no attached buffer.
-        * 3. is already cached on the L2ARC.
-        * 4. has an I/O in progress (it may be an incomplete read).
-        * 5. is flagged not eligible (zfs property).
+        * 2. is already cached on the L2ARC.
+        * 3. has an I/O in progress (it may be an incomplete read).
+        * 4. is flagged not eligible (zfs property).
         */
-       if (ab->b_spa != spa_guid || ab->b_buf == NULL || ab->b_l2hdr != NULL ||
+       if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
            HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
                return (B_FALSE);
 
@@ -3815,7 +3772,7 @@ l2arc_write_size(l2arc_dev_t *dev)
 static clock_t
 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
 {
-       clock_t interval, next;
+       clock_t interval, next, now;
 
        /*
         * If the ARC lists are busy, increase our write rate; if the
@@ -3828,7 +3785,8 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
        else
                interval = hz * l2arc_feed_secs;
 
-       next = MAX(lbolt, MIN(lbolt + interval, began + interval));
+       now = ddi_get_lbolt();
+       next = MAX(now, MIN(now + interval, began + interval));
 
        return (next);
 }
@@ -4030,11 +3988,11 @@ l2arc_read_done(zio_t *zio)
        ASSERT(cb != NULL);
        buf = cb->l2rcb_buf;
        ASSERT(buf != NULL);
-       hdr = buf->b_hdr;
-       ASSERT(hdr != NULL);
 
-       hash_lock = HDR_LOCK(hdr);
+       hash_lock = HDR_LOCK(buf->b_hdr);
        mutex_enter(hash_lock);
+       hdr = buf->b_hdr;
+       ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
        /*
         * Check this survived the L2ARC journey.
@@ -4247,7 +4205,7 @@ top:
        }
        mutex_exit(&l2arc_buflist_mtx);
 
-       spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
+       vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
        dev->l2ad_evict = taddr;
 }
 
@@ -4407,15 +4365,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
        ARCSTAT_BUMP(arcstat_l2_writes_sent);
        ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
        ARCSTAT_INCR(arcstat_l2_size, write_sz);
-       spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
+       vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
 
        /*
         * Bump device hand to the device start if it is approaching the end.
         * l2arc_evict() will already have evicted ahead for this case.
         */
        if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
-               spa_l2cache_space_update(dev->l2ad_vdev, 0,
-                   dev->l2ad_end - dev->l2ad_hand);
+               vdev_space_update(dev->l2ad_vdev,
+                   dev->l2ad_end - dev->l2ad_hand, 0, 0);
                dev->l2ad_hand = dev->l2ad_start;
                dev->l2ad_evict = dev->l2ad_start;
                dev->l2ad_first = B_FALSE;
@@ -4439,7 +4397,7 @@ l2arc_feed_thread(void)
        l2arc_dev_t *dev;
        spa_t *spa;
        uint64_t size, wrote;
-       clock_t begin, next = lbolt;
+       clock_t begin, next = ddi_get_lbolt();
 
        CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
@@ -4450,7 +4408,7 @@ l2arc_feed_thread(void)
                (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
                    next);
                CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
-               next = lbolt + hz;
+               next = ddi_get_lbolt() + hz;
 
                /*
                 * Quick check for L2ARC devices.
@@ -4461,7 +4419,7 @@ l2arc_feed_thread(void)
                        continue;
                }
                mutex_exit(&l2arc_dev_mtx);
-               begin = lbolt;
+               begin = ddi_get_lbolt();
 
                /*
                 * This selects the next l2arc device to write to, and in
@@ -4566,7 +4524,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
        list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
            offsetof(arc_buf_hdr_t, b_l2node));
 
-       spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
+       vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
 
        /*
         * Add device to global list