Linux 3.5 compat, eops->encode_fh() takes inodes

[zfs.git] / module / zfs / arc.c
diff --git a/module/zfs/arc.c b/module/zfs/arc.c

index d5e5aa5..aad37a3 100644 (file)
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -19,8 +19,9 @@
   * CDDL HEADER END
   */
  /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
   */
  
  /*
@@ -105,6 +106,14 @@
   * protected from simultaneous callbacks from arc_buf_evict()
   * and arc_do_user_evicts().
   *
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted.  In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored.  For example,
+ * when using the ZPL each dentry holds a reference on a znode.  These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
   * Note that the majority of the performance stats are manipulated
   * with atomic operations.
   *
@@ -119,20 +128,20 @@
  
  #include <sys/spa.h>
  #include <sys/zio.h>
-#include <sys/zio_checksum.h>
  #include <sys/zfs_context.h>
  #include <sys/arc.h>
-#include <sys/refcount.h>
  #include <sys/vdev.h>
  #include <sys/vdev_impl.h>
  #ifdef _KERNEL
  #include <sys/vmsystm.h>
  #include <vm/anon.h>
  #include <sys/fs/swapnode.h>
-#include <sys/dnlc.h>
+#include <sys/zpl.h>
  #endif
  #include <sys/callb.h>
  #include <sys/kstat.h>
+#include <sys/dmu_tx.h>
+#include <zfs_fletcher.h>
  
  static kmutex_t                arc_reclaim_thr_lock;
  static kcondvar_t      arc_reclaim_thr_cv;     /* used to signal reclaim thr */
@@ -142,8 +151,8 @@ extern int zfs_write_limit_shift;
  extern uint64_t zfs_write_limit_max;
  extern kmutex_t zfs_write_limit_lock;
  
-#define        ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+/* number of bytes to prune from caches when arc_meta_limit is reached */
+uint_t arc_meta_prune = 1048576;
  
  typedef enum arc_reclaim_strategy {
         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
@@ -151,7 +160,10 @@ typedef enum arc_reclaim_strategy {
  } arc_reclaim_strategy_t;
  
  /* number of seconds before growing cache again */
-static int             arc_grow_retry = 60;
+static int             arc_grow_retry = 5;
+
+/* expiration time for arc_no_grow */
+static clock_t         arc_grow_time = 0;
  
  /* shift of arc_c for calculating both min and max arc_p */
  static int             arc_p_min_shift = 4;
@@ -175,13 +187,13 @@ static boolean_t arc_warm;
  /*
   * These tunables are for performance analysis.
   */
-uint64_t zfs_arc_max;
-uint64_t zfs_arc_min;
-uint64_t zfs_arc_meta_limit = 0;
-int zfs_mdcomp_disable = 0;
+unsigned long zfs_arc_max = 0;
+unsigned long zfs_arc_min = 0;
+unsigned long zfs_arc_meta_limit = 0;
  int zfs_arc_grow_retry = 0;
  int zfs_arc_shrink_shift = 0;
  int zfs_arc_p_min_shift = 0;
+int zfs_arc_meta_prune = 0;
  
  /*
   * Note that buffers can be in one of 6 states:
@@ -249,6 +261,9 @@ typedef struct arc_stats {
         kstat_named_t arcstat_recycle_miss;
         kstat_named_t arcstat_mutex_miss;
         kstat_named_t arcstat_evict_skip;
+       kstat_named_t arcstat_evict_l2_cached;
+       kstat_named_t arcstat_evict_l2_eligible;
+       kstat_named_t arcstat_evict_l2_ineligible;
         kstat_named_t arcstat_hash_elements;
         kstat_named_t arcstat_hash_elements_max;
         kstat_named_t arcstat_hash_collisions;
@@ -262,6 +277,21 @@ typedef struct arc_stats {
         kstat_named_t arcstat_hdr_size;
         kstat_named_t arcstat_data_size;
         kstat_named_t arcstat_other_size;
+       kstat_named_t arcstat_anon_size;
+       kstat_named_t arcstat_anon_evict_data;
+       kstat_named_t arcstat_anon_evict_metadata;
+       kstat_named_t arcstat_mru_size;
+       kstat_named_t arcstat_mru_evict_data;
+       kstat_named_t arcstat_mru_evict_metadata;
+       kstat_named_t arcstat_mru_ghost_size;
+       kstat_named_t arcstat_mru_ghost_evict_data;
+       kstat_named_t arcstat_mru_ghost_evict_metadata;
+       kstat_named_t arcstat_mfu_size;
+       kstat_named_t arcstat_mfu_evict_data;
+       kstat_named_t arcstat_mfu_evict_metadata;
+       kstat_named_t arcstat_mfu_ghost_size;
+       kstat_named_t arcstat_mfu_ghost_evict_data;
+       kstat_named_t arcstat_mfu_ghost_evict_metadata;
         kstat_named_t arcstat_l2_hits;
         kstat_named_t arcstat_l2_misses;
         kstat_named_t arcstat_l2_feeds;
@@ -281,6 +311,15 @@ typedef struct arc_stats {
         kstat_named_t arcstat_l2_size;
         kstat_named_t arcstat_l2_hdr_size;
         kstat_named_t arcstat_memory_throttle_count;
+       kstat_named_t arcstat_memory_direct_count;
+       kstat_named_t arcstat_memory_indirect_count;
+       kstat_named_t arcstat_no_grow;
+       kstat_named_t arcstat_tempreserve;
+       kstat_named_t arcstat_loaned_bytes;
+       kstat_named_t arcstat_prune;
+       kstat_named_t arcstat_meta_used;
+       kstat_named_t arcstat_meta_limit;
+       kstat_named_t arcstat_meta_max;
  } arc_stats_t;
  
  static arc_stats_t arc_stats = {
@@ -302,6 +341,9 @@ static arc_stats_t arc_stats = {
         { "recycle_miss",               KSTAT_DATA_UINT64 },
         { "mutex_miss",                 KSTAT_DATA_UINT64 },
         { "evict_skip",                 KSTAT_DATA_UINT64 },
+       { "evict_l2_cached",            KSTAT_DATA_UINT64 },
+       { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
+       { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
         { "hash_elements",              KSTAT_DATA_UINT64 },
         { "hash_elements_max",          KSTAT_DATA_UINT64 },
         { "hash_collisions",            KSTAT_DATA_UINT64 },
@@ -315,6 +357,21 @@ static arc_stats_t arc_stats = {
         { "hdr_size",                   KSTAT_DATA_UINT64 },
         { "data_size",                  KSTAT_DATA_UINT64 },
         { "other_size",                 KSTAT_DATA_UINT64 },
+       { "anon_size",                  KSTAT_DATA_UINT64 },
+       { "anon_evict_data",            KSTAT_DATA_UINT64 },
+       { "anon_evict_metadata",        KSTAT_DATA_UINT64 },
+       { "mru_size",                   KSTAT_DATA_UINT64 },
+       { "mru_evict_data",             KSTAT_DATA_UINT64 },
+       { "mru_evict_metadata",         KSTAT_DATA_UINT64 },
+       { "mru_ghost_size",             KSTAT_DATA_UINT64 },
+       { "mru_ghost_evict_data",       KSTAT_DATA_UINT64 },
+       { "mru_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
+       { "mfu_size",                   KSTAT_DATA_UINT64 },
+       { "mfu_evict_data",             KSTAT_DATA_UINT64 },
+       { "mfu_evict_metadata",         KSTAT_DATA_UINT64 },
+       { "mfu_ghost_size",             KSTAT_DATA_UINT64 },
+       { "mfu_ghost_evict_data",       KSTAT_DATA_UINT64 },
+       { "mfu_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
         { "l2_hits",                    KSTAT_DATA_UINT64 },
         { "l2_misses",                  KSTAT_DATA_UINT64 },
         { "l2_feeds",                   KSTAT_DATA_UINT64 },
@@ -333,7 +390,16 @@ static arc_stats_t arc_stats = {
         { "l2_io_error",                KSTAT_DATA_UINT64 },
         { "l2_size",                    KSTAT_DATA_UINT64 },
         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
-       { "memory_throttle_count",      KSTAT_DATA_UINT64 }
+       { "memory_throttle_count",      KSTAT_DATA_UINT64 },
+       { "memory_direct_count",        KSTAT_DATA_UINT64 },
+       { "memory_indirect_count",      KSTAT_DATA_UINT64 },
+       { "arc_no_grow",                KSTAT_DATA_UINT64 },
+       { "arc_tempreserve",            KSTAT_DATA_UINT64 },
+       { "arc_loaned_bytes",           KSTAT_DATA_UINT64 },
+       { "arc_prune",                  KSTAT_DATA_UINT64 },
+       { "arc_meta_used",              KSTAT_DATA_UINT64 },
+       { "arc_meta_limit",             KSTAT_DATA_UINT64 },
+       { "arc_meta_max",               KSTAT_DATA_UINT64 },
  };
  
  #define        ARCSTAT(stat)   (arc_stats.stat.value.ui64)
@@ -341,7 +407,7 @@ static arc_stats_t arc_stats = {
  #define        ARCSTAT_INCR(stat, val) \
         atomic_add_64(&arc_stats.stat.value.ui64, (val));
  
-#define        ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
+#define        ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
  #define        ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
  
  #define        ARCSTAT_MAX(stat, val) {                                        \
@@ -375,7 +441,7 @@ static arc_stats_t arc_stats = {
         }
  
  kstat_t                        *arc_ksp;
-static arc_state_t     *arc_anon;
+static arc_state_t     *arc_anon;
  static arc_state_t     *arc_mru;
  static arc_state_t     *arc_mru_ghost;
  static arc_state_t     *arc_mfu;
@@ -395,13 +461,12 @@ static arc_state_t        *arc_l2c_only;
  #define        arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
  #define        arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
  #define        arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
-
-static int             arc_no_grow;    /* Don't try to grow cache size */
-static uint64_t                arc_tempreserve;
-static uint64_t                arc_loaned_bytes;
-static uint64_t                arc_meta_used;
-static uint64_t                arc_meta_limit;
-static uint64_t                arc_meta_max = 0;
+#define        arc_no_grow     ARCSTAT(arcstat_no_grow)
+#define        arc_tempreserve ARCSTAT(arcstat_tempreserve)
+#define        arc_loaned_bytes        ARCSTAT(arcstat_loaned_bytes)
+#define        arc_meta_used   ARCSTAT(arcstat_meta_used)
+#define        arc_meta_limit  ARCSTAT(arcstat_meta_limit)
+#define        arc_meta_max    ARCSTAT(arcstat_meta_max)
  
  typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
  
@@ -432,6 +497,7 @@ struct arc_buf_hdr {
  
         kmutex_t                b_freeze_lock;
         zio_cksum_t             *b_freeze_cksum;
+       void                    *b_thawed;
  
         arc_buf_hdr_t           *b_hash_next;
         arc_buf_t               *b_buf;
@@ -460,6 +526,8 @@ struct arc_buf_hdr {
         list_node_t             b_l2node;
  };
  
+static list_t arc_prune_list;
+static kmutex_t arc_prune_mtx;
  static arc_buf_t *arc_eviction_list;
  static kmutex_t arc_eviction_mtx;
  static arc_buf_hdr_t arc_eviction_hdr;
@@ -468,6 +536,8 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
  static int arc_evict_needed(arc_buf_contents_t type);
  static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
  
+static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
+
  #define        GHOST_STATE(state)      \
         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
         (state) == arc_l2c_only)
@@ -490,7 +560,6 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
  #define        ARC_L2_WRITING          (1 << 16)       /* L2ARC write in progress */
  #define        ARC_L2_EVICTED          (1 << 17)       /* evicted during I/O */
  #define        ARC_L2_WRITE_HEAD       (1 << 18)       /* head of write list */
-#define        ARC_STORED              (1 << 19)       /* has been store()d to */
  
  #define        HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
  #define        HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -517,12 +586,13 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
   * Hash table routines
   */
  
-#define        HT_LOCK_PAD     64
+#define        HT_LOCK_ALIGN   64
+#define        HT_LOCK_PAD     (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
  
  struct ht_lock {
         kmutex_t        ht_lock;
  #ifdef _KERNEL
-       unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+       unsigned char   pad[HT_LOCK_PAD];
  #endif
  };
  
@@ -539,8 +609,8 @@ static buf_hash_table_t buf_hash_table;
         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
  #define        BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
  #define        BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-#define        HDR_LOCK(buf) \
-       (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+#define        HDR_LOCK(hdr) \
+       (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
  
  uint64_t zfs_crc64_table[256];
  
@@ -559,14 +629,14 @@ uint64_t zfs_crc64_table[256];
  /*
   * L2ARC Performance Tunables
   */
-uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;   /* default max write size */
-uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
-uint64_t l2arc_headroom = L2ARC_HEADROOM;      /* number of dev writes */
-uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;    /* interval seconds */
-uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;        /* min interval milliseconds */
-boolean_t l2arc_noprefetch = B_TRUE;           /* don't cache prefetch bufs */
-boolean_t l2arc_feed_again = B_TRUE;           /* turbo warmup */
-boolean_t l2arc_norw = B_TRUE;                 /* no reads during writes */
+unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;      /* def max write size */
+unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;    /* extra warmup write */
+unsigned long l2arc_headroom = L2ARC_HEADROOM;         /* # of dev writes */
+unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;       /* interval seconds */
+unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;   /* min interval msecs */
+int l2arc_noprefetch = B_TRUE;                 /* don't cache prefetch bufs */
+int l2arc_feed_again = B_TRUE;                 /* turbo warmup */
+int l2arc_norw = B_TRUE;                       /* no reads during writes */
  
  /*
   * L2ARC Internals
@@ -658,6 +728,15 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
  
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+       hdr->b_dva.dva_word[0] = 0;
+       hdr->b_dva.dva_word[1] = 0;
+       hdr->b_birth = 0;
+       hdr->b_cksum0 = 0;
+}
+
  static arc_buf_hdr_t *
  buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
  {
@@ -757,8 +836,15 @@ buf_fini(void)
  {
         int i;
  
+#if defined(_KERNEL) && defined(HAVE_SPL)
+       /* Large allocations which do not require contiguous pages
+        * should be using vmem_free() in the linux kernel */
+       vmem_free(buf_hash_table.ht_table,
+           (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#else
         kmem_free(buf_hash_table.ht_table,
             (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#endif
         for (i = 0; i < BUF_LOCKS; i++)
                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
         kmem_cache_destroy(hdr_cache);
@@ -779,6 +865,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag)
         refcount_create(&buf->b_refcnt);
         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+       list_link_init(&buf->b_arc_node);
+       list_link_init(&buf->b_l2node);
         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
  
         return (0);
@@ -791,7 +879,8 @@ buf_cons(void *vbuf, void *unused, int kmflag)
         arc_buf_t *buf = vbuf;
  
         bzero(buf, sizeof (arc_buf_t));
-       rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
+       mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+       rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
  
         return (0);
@@ -807,6 +896,7 @@ hdr_dest(void *vbuf, void *unused)
  {
         arc_buf_hdr_t *buf = vbuf;
  
+       ASSERT(BUF_EMPTY(buf));
         refcount_destroy(&buf->b_refcnt);
         cv_destroy(&buf->b_cv);
         mutex_destroy(&buf->b_freeze_lock);
@@ -819,26 +909,11 @@ buf_dest(void *vbuf, void *unused)
  {
         arc_buf_t *buf = vbuf;
  
-       rw_destroy(&buf->b_lock);
+       mutex_destroy(&buf->b_evict_lock);
+       rw_destroy(&buf->b_data_lock);
         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
  }
  
-/*
- * Reclaim callback -- invoked when memory is low.
- */
-/* ARGSUSED */
-static void
-hdr_recl(void *unused)
-{
-       dprintf("hdr_recl called\n");
-       /*
-        * umem calls the reclaim func when we destroy the buf cache,
-        * which is after we do arc_fini().
-        */
-       if (!arc_dead)
-               cv_signal(&arc_reclaim_thr_cv);
-}
-
  static void
  buf_init(void)
  {
@@ -855,8 +930,15 @@ buf_init(void)
                 hsize <<= 1;
  retry:
         buf_hash_table.ht_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+       /* Large allocations which do not require contiguous pages
+        * should be using vmem_alloc() in the linux kernel */
+       buf_hash_table.ht_table =
+           vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+#else
         buf_hash_table.ht_table =
             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+#endif
         if (buf_hash_table.ht_table == NULL) {
                 ASSERT(hsize > (1ULL << 8));
                 hsize >>= 1;
@@ -864,7 +946,7 @@ retry:
         }
  
         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
-           0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+           0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0);
         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
  
@@ -925,7 +1007,8 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
                 mutex_exit(&buf->b_hdr->b_freeze_lock);
                 return;
         }
-       buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+       buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+                                               KM_PUSHPAGE);
         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
             buf->b_hdr->b_freeze_cksum);
         mutex_exit(&buf->b_hdr->b_freeze_lock);
@@ -947,18 +1030,31 @@ arc_buf_thaw(arc_buf_t *buf)
                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
                 buf->b_hdr->b_freeze_cksum = NULL;
         }
+
+       if (zfs_flags & ZFS_DEBUG_MODIFY) {
+               if (buf->b_hdr->b_thawed)
+                       kmem_free(buf->b_hdr->b_thawed, 1);
+               buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+       }
+
         mutex_exit(&buf->b_hdr->b_freeze_lock);
  }
  
  void
  arc_buf_freeze(arc_buf_t *buf)
  {
+       kmutex_t *hash_lock;
+
         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
                 return;
  
+       hash_lock = HDR_LOCK(buf->b_hdr);
+       mutex_enter(hash_lock);
+
         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
             buf->b_hdr->b_state == arc_anon);
         arc_cksum_compute(buf, B_FALSE);
+       mutex_exit(hash_lock);
  }
  
  static void
@@ -1030,6 +1126,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
         ASSERT(new_state != old_state);
         ASSERT(refcnt == 0 || ab->b_datacnt > 0);
         ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+       ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
  
         from_delta = to_delta = ab->b_datacnt * ab->b_size;
  
@@ -1050,7 +1147,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
  
                         /*
                          * If prefetching out of the ghost cache,
-                        * we will have a non-null datacnt.
+                        * we will have a non-zero datacnt.
                          */
                         if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
                                 /* ghost elements have a ghost size */
@@ -1086,9 +1183,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
         }
  
         ASSERT(!BUF_EMPTY(ab));
-       if (new_state == arc_anon) {
+       if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
                 buf_hash_remove(ab);
-       }
  
         /* adjust state sizes */
         if (to_delta)
@@ -1112,6 +1208,8 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
  
         switch (type) {
+       default:
+               break;
         case ARC_SPACE_DATA:
                 ARCSTAT_INCR(arcstat_data_size, space);
                 break;
@@ -1136,6 +1234,8 @@ arc_space_return(uint64_t space, arc_space_type_t type)
         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
  
         switch (type) {
+       default:
+               break;
         case ARC_SPACE_DATA:
                 ARCSTAT_INCR(arcstat_data_size, -space);
                 break;
@@ -1186,7 +1286,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
         ASSERT(BUF_EMPTY(hdr));
         hdr->b_size = size;
         hdr->b_type = type;
-       hdr->b_spa = spa_guid(spa);
+       hdr->b_spa = spa_load_guid(spa);
         hdr->b_state = arc_anon;
         hdr->b_arc_access = 0;
         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
@@ -1232,14 +1332,29 @@ arc_return_buf(arc_buf_t *buf, void *tag)
  {
         arc_buf_hdr_t *hdr = buf->b_hdr;
  
-       ASSERT(hdr->b_state == arc_anon);
         ASSERT(buf->b_data != NULL);
-       VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
-       VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
+       (void) refcount_add(&hdr->b_refcnt, tag);
+       (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
  
         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
  }
  
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+       arc_buf_hdr_t *hdr;
+
+       ASSERT(buf->b_data != NULL);
+       hdr = buf->b_hdr;
+       (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
+       (void) refcount_remove(&hdr->b_refcnt, tag);
+       buf->b_efunc = NULL;
+       buf->b_private = NULL;
+
+       atomic_add_64(&arc_loaned_bytes, hdr->b_size);
+}
+
  static arc_buf_t *
  arc_buf_clone(arc_buf_t *from)
  {
@@ -1247,6 +1362,8 @@ arc_buf_clone(arc_buf_t *from)
         arc_buf_hdr_t *hdr = from->b_hdr;
         uint64_t size = hdr->b_size;
  
+       ASSERT(hdr->b_state != arc_anon);
+
         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
         buf->b_hdr = hdr;
         buf->b_data = NULL;
@@ -1271,16 +1388,16 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
          * must verify b_data != NULL to know if the add_ref
          * was successful.
          */
-       rw_enter(&buf->b_lock, RW_READER);
+       mutex_enter(&buf->b_evict_lock);
         if (buf->b_data == NULL) {
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                 return;
         }
-       hdr = buf->b_hdr;
-       ASSERT(hdr != NULL);
-       hash_lock = HDR_LOCK(hdr);
+       hash_lock = HDR_LOCK(buf->b_hdr);
         mutex_enter(hash_lock);
-       rw_exit(&buf->b_lock);
+       hdr = buf->b_hdr;
+       ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+       mutex_exit(&buf->b_evict_lock);
  
         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
         add_reference(hdr, hash_lock, tag);
@@ -1328,6 +1445,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
                 arc_buf_contents_t type = buf->b_hdr->b_type;
  
                 arc_cksum_verify(buf);
+
                 if (!recycle) {
                         if (type == ARC_BUFC_METADATA) {
                                 arc_buf_data_free(buf->b_hdr, zio_buf_free,
@@ -1365,6 +1483,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
         for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
                 continue;
         *bufp = buf->b_next;
+       buf->b_next = NULL;
  
         ASSERT(buf->b_efunc == NULL);
  
@@ -1376,58 +1495,59 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
  static void
  arc_hdr_destroy(arc_buf_hdr_t *hdr)
  {
+       l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+
         ASSERT(refcount_is_zero(&hdr->b_refcnt));
         ASSERT3P(hdr->b_state, ==, arc_anon);
         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-       ASSERT(!(hdr->b_flags & ARC_STORED));
  
-       if (hdr->b_l2hdr != NULL) {
-               if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
-                       /*
-                        * To prevent arc_free() and l2arc_evict() from
-                        * attempting to free the same buffer at the same time,
-                        * a FREE_IN_PROGRESS flag is given to arc_free() to
-                        * give it priority.  l2arc_evict() can't destroy this
-                        * header while we are waiting on l2arc_buflist_mtx.
-                        *
-                        * The hdr may be removed from l2ad_buflist before we
-                        * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
-                        */
+       if (l2hdr != NULL) {
+               boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
+               /*
+                * To prevent arc_free() and l2arc_evict() from
+                * attempting to free the same buffer at the same time,
+                * a FREE_IN_PROGRESS flag is given to arc_free() to
+                * give it priority.  l2arc_evict() can't destroy this
+                * header while we are waiting on l2arc_buflist_mtx.
+                *
+                * The hdr may be removed from l2ad_buflist before we
+                * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+                */
+               if (!buflist_held) {
                         mutex_enter(&l2arc_buflist_mtx);
-                       if (hdr->b_l2hdr != NULL) {
-                               list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
-                                   hdr);
-                       }
-                       mutex_exit(&l2arc_buflist_mtx);
-               } else {
-                       list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+                       l2hdr = hdr->b_l2hdr;
                 }
-               ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
-               kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
-               if (hdr->b_state == arc_l2c_only)
-                       l2arc_hdr_stat_remove();
-               hdr->b_l2hdr = NULL;
+
+               if (l2hdr != NULL) {
+                       list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+                       ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+                       kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+                       if (hdr->b_state == arc_l2c_only)
+                               l2arc_hdr_stat_remove();
+                       hdr->b_l2hdr = NULL;
+               }
+
+               if (!buflist_held)
+                       mutex_exit(&l2arc_buflist_mtx);
         }
  
         if (!BUF_EMPTY(hdr)) {
                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
-               bzero(&hdr->b_dva, sizeof (dva_t));
-               hdr->b_birth = 0;
-               hdr->b_cksum0 = 0;
+               buf_discard_identity(hdr);
         }
         while (hdr->b_buf) {
                 arc_buf_t *buf = hdr->b_buf;
  
                 if (buf->b_efunc) {
                         mutex_enter(&arc_eviction_mtx);
-                       rw_enter(&buf->b_lock, RW_WRITER);
+                       mutex_enter(&buf->b_evict_lock);
                         ASSERT(buf->b_hdr != NULL);
                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
                         hdr->b_buf = buf->b_next;
                         buf->b_hdr = &arc_eviction_hdr;
                         buf->b_next = arc_eviction_list;
                         arc_eviction_list = buf;
-                       rw_exit(&buf->b_lock);
+                       mutex_exit(&buf->b_evict_lock);
                         mutex_exit(&arc_eviction_mtx);
                 } else {
                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
@@ -1437,6 +1557,10 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
                 hdr->b_freeze_cksum = NULL;
         }
+       if (hdr->b_thawed) {
+               kmem_free(hdr->b_thawed, 1);
+               hdr->b_thawed = NULL;
+       }
  
         ASSERT(!list_link_active(&hdr->b_arc_node));
         ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1457,11 +1581,17 @@ arc_buf_free(arc_buf_t *buf, void *tag)
                 kmutex_t *hash_lock = HDR_LOCK(hdr);
  
                 mutex_enter(hash_lock);
+               hdr = buf->b_hdr;
+               ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
                 (void) remove_reference(hdr, hash_lock, tag);
-               if (hdr->b_datacnt > 1)
+               if (hdr->b_datacnt > 1) {
                         arc_buf_destroy(buf, FALSE, TRUE);
-               else
+               } else {
+                       ASSERT(buf == hdr->b_buf);
+                       ASSERT(buf->b_efunc == NULL);
                         hdr->b_flags |= ARC_BUF_AVAILABLE;
+               }
                 mutex_exit(hash_lock);
         } else if (HDR_IO_IN_PROGRESS(hdr)) {
                 int destroy_hdr;
@@ -1478,12 +1608,10 @@ arc_buf_free(arc_buf_t *buf, void *tag)
                 if (destroy_hdr)
                         arc_hdr_destroy(hdr);
         } else {
-               if (remove_reference(hdr, NULL, tag) > 0) {
-                       ASSERT(HDR_IO_ERROR(hdr));
+               if (remove_reference(hdr, NULL, tag) > 0)
                         arc_buf_destroy(buf, FALSE, TRUE);
-               } else {
+               else
                         arc_hdr_destroy(hdr);
-               }
         }
  }
  
@@ -1495,11 +1623,14 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
         int no_callback = (buf->b_efunc == NULL);
  
         if (hdr->b_state == arc_anon) {
+               ASSERT(hdr->b_datacnt == 1);
                 arc_buf_free(buf, tag);
                 return (no_callback);
         }
  
         mutex_enter(hash_lock);
+       hdr = buf->b_hdr;
+       ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
         ASSERT(hdr->b_state != arc_anon);
         ASSERT(buf->b_data != NULL);
  
@@ -1509,6 +1640,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
                         arc_buf_destroy(buf, FALSE, TRUE);
         } else if (no_callback) {
                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+               ASSERT(buf->b_efunc == NULL);
                 hdr->b_flags |= ARC_BUF_AVAILABLE;
         }
         ASSERT(no_callback || hdr->b_datacnt > 1 ||
@@ -1561,7 +1693,8 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                 if (HDR_IO_IN_PROGRESS(ab) ||
                     (spa && ab->b_spa != spa) ||
                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
-                   lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
+                   ddi_get_lbolt() - ab->b_arc_access <
+                   arc_min_prefetch_lifespan)) {
                         skipped++;
                         continue;
                 }
@@ -1576,7 +1709,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                         ASSERT(ab->b_datacnt > 0);
                         while (ab->b_buf) {
                                 arc_buf_t *buf = ab->b_buf;
-                               if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
+                               if (!mutex_tryenter(&buf->b_evict_lock)) {
                                         missed += 1;
                                         break;
                                 }
@@ -1598,13 +1731,28 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                                         buf->b_next = arc_eviction_list;
                                         arc_eviction_list = buf;
                                         mutex_exit(&arc_eviction_mtx);
-                                       rw_exit(&buf->b_lock);
+                                       mutex_exit(&buf->b_evict_lock);
                                 } else {
-                                       rw_exit(&buf->b_lock);
+                                       mutex_exit(&buf->b_evict_lock);
                                         arc_buf_destroy(buf,
                                             buf->b_data == stolen, TRUE);
                                 }
                         }
+
+                       if (ab->b_l2hdr) {
+                               ARCSTAT_INCR(arcstat_evict_l2_cached,
+                                   ab->b_size);
+                       } else {
+                               if (l2arc_write_eligible(ab->b_spa, ab)) {
+                                       ARCSTAT_INCR(arcstat_evict_l2_eligible,
+                                           ab->b_size);
+                               } else {
+                                       ARCSTAT_INCR(
+                                           arcstat_evict_l2_ineligible,
+                                           ab->b_size);
+                               }
+                       }
+
                         if (ab->b_datacnt == 0) {
                                 arc_change_state(evicted_state, ab, hash_lock);
                                 ASSERT(HDR_IN_HASH_TABLE(ab));
@@ -1625,7 +1773,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
         mutex_exit(&state->arcs_mtx);
  
         if (bytes_evicted < bytes)
-               dprintf("only evicted %lld bytes from %x",
+               dprintf("only evicted %lld bytes from %x\n",
                     (longlong_t)bytes_evicted, state);
  
         if (skipped)
@@ -1646,12 +1794,12 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
                         int64_t todelete =
                             MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
-                       arc_evict_ghost(arc_mru_ghost, NULL, todelete);
+                       arc_evict_ghost(arc_mru_ghost, 0, todelete);
                 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
                         int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
                             arc_mru_ghost->arcs_size +
                             arc_mfu_ghost->arcs_size - arc_c);
-                       arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
+                       arc_evict_ghost(arc_mfu_ghost, 0, todelete);
                 }
         }
  
@@ -1666,19 +1814,29 @@ static void
  arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
  {
         arc_buf_hdr_t *ab, *ab_prev;
+       arc_buf_hdr_t marker;
         list_t *list = &state->arcs_list[ARC_BUFC_DATA];
         kmutex_t *hash_lock;
         uint64_t bytes_deleted = 0;
         uint64_t bufs_skipped = 0;
  
         ASSERT(GHOST_STATE(state));
+       bzero(&marker, sizeof(marker));
  top:
         mutex_enter(&state->arcs_mtx);
         for (ab = list_tail(list); ab; ab = ab_prev) {
                 ab_prev = list_prev(list, ab);
                 if (spa && ab->b_spa != spa)
                         continue;
+
+               /* ignore markers */
+               if (ab->b_spa == 0)
+                       continue;
+
                 hash_lock = HDR_LOCK(ab);
+               /* caller may be trying to modify this buffer, skip it */
+               if (MUTEX_HELD(hash_lock))
+                       continue;
                 if (mutex_tryenter(hash_lock)) {
                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
                         ASSERT(ab->b_buf == NULL);
@@ -1701,15 +1859,21 @@ top:
                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
                         if (bytes >= 0 && bytes_deleted >= bytes)
                                 break;
-               } else {
-                       if (bytes < 0) {
-                               mutex_exit(&state->arcs_mtx);
-                               mutex_enter(hash_lock);
-                               mutex_exit(hash_lock);
-                               goto top;
-                       }
+               } else if (bytes < 0) {
+                       /*
+                        * Insert a list marker and then wait for the
+                        * hash lock to become available. Once it's
+                        * available, restart from where we left off.
+                        */
+                       list_insert_after(list, ab, &marker);
+                       mutex_exit(&state->arcs_mtx);
+                       mutex_enter(hash_lock);
+                       mutex_exit(hash_lock);
+                       mutex_enter(&state->arcs_mtx);
+                       ab_prev = list_prev(list, &marker);
+                       list_remove(list, &marker);
+               } else
                         bufs_skipped += 1;
-               }
         }
         mutex_exit(&state->arcs_mtx);
  
@@ -1725,7 +1889,7 @@ top:
         }
  
         if (bytes_deleted < bytes)
-               dprintf("only deleted %lld bytes from %p",
+               dprintf("only deleted %lld bytes from %p\n",
                     (longlong_t)bytes_deleted, state);
  }
  
@@ -1738,18 +1902,19 @@ arc_adjust(void)
          * Adjust MRU size
          */
  
-       adjustment = MIN(arc_size - arc_c,
-           arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p);
+       adjustment = MIN((int64_t)(arc_size - arc_c),
+           (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
+           arc_p));
  
         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
-               (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
+               (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
                 adjustment -= delta;
         }
  
         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
-               (void) arc_evict(arc_mru, NULL, delta, FALSE,
+               (void) arc_evict(arc_mru, 0, delta, FALSE,
                     ARC_BUFC_METADATA);
         }
  
@@ -1761,14 +1926,14 @@ arc_adjust(void)
  
         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
                 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
-               (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
+               (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
                 adjustment -= delta;
         }
  
         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
                 int64_t delta = MIN(adjustment,
                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
-               (void) arc_evict(arc_mfu, NULL, delta, FALSE,
+               (void) arc_evict(arc_mfu, 0, delta, FALSE,
                     ARC_BUFC_METADATA);
         }
  
@@ -1780,7 +1945,7 @@ arc_adjust(void)
  
         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
-               arc_evict_ghost(arc_mru_ghost, NULL, delta);
+               arc_evict_ghost(arc_mru_ghost, 0, delta);
         }
  
         adjustment =
@@ -1788,8 +1953,50 @@ arc_adjust(void)
  
         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
-               arc_evict_ghost(arc_mfu_ghost, NULL, delta);
+               arc_evict_ghost(arc_mfu_ghost, 0, delta);
+       }
+}
+
+/*
+ * Request that arc user drop references so that N bytes can be released
+ * from the cache.  This provides a mechanism to ensure the arc can honor
+ * the arc_meta_limit and reclaim buffers which are pinned in the cache
+ * by higher layers.  (i.e. the zpl)
+ */
+static void
+arc_do_user_prune(int64_t adjustment)
+{
+       arc_prune_func_t *func;
+       void *private;
+       arc_prune_t *cp, *np;
+
+       mutex_enter(&arc_prune_mtx);
+
+       cp = list_head(&arc_prune_list);
+       while (cp != NULL) {
+               func = cp->p_pfunc;
+               private = cp->p_private;
+               np = list_next(&arc_prune_list, cp);
+               refcount_add(&cp->p_refcnt, func);
+               mutex_exit(&arc_prune_mtx);
+
+               if (func != NULL)
+                       func(adjustment, private);
+
+               mutex_enter(&arc_prune_mtx);
+
+               /* User removed prune callback concurrently with execution */
+               if (refcount_remove(&cp->p_refcnt, func) == 0) {
+                       ASSERT(!list_link_active(&cp->p_node));
+                       refcount_destroy(&cp->p_refcnt);
+                       kmem_free(cp, sizeof (*cp));
+               }
+
+               cp = np;
         }
+
+       ARCSTAT_BUMP(arcstat_prune);
+       mutex_exit(&arc_prune_mtx);
  }
  
  static void
@@ -1799,9 +2006,9 @@ arc_do_user_evicts(void)
         while (arc_eviction_list != NULL) {
                 arc_buf_t *buf = arc_eviction_list;
                 arc_eviction_list = buf->b_next;
-               rw_enter(&buf->b_lock, RW_WRITER);
+               mutex_enter(&buf->b_evict_lock);
                 buf->b_hdr = NULL;
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                 mutex_exit(&arc_eviction_mtx);
  
                 if (buf->b_efunc != NULL)
@@ -1816,6 +2023,32 @@ arc_do_user_evicts(void)
  }
  
  /*
+ * Evict only meta data objects from the cache leaving the data objects.
+ * This is only used to enforce the tunable arc_meta_limit; if we are
+ * unable to evict enough buffers, notify the user via the prune callback.
+ */
+void
+arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
+{
+       int64_t delta;
+
+       if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+               delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+               arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
+               adjustment -= delta;
+       }
+
+       if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+               delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+               arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
+               adjustment -= delta;
+       }
+
+       if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
+               arc_do_user_prune(arc_meta_prune);
+}
+
+/*
   * Flush all *evictable* data from the cache for the given spa.
   * NOTE: this will not touch "active" (i.e. referenced) data.
   */
@@ -1825,7 +2058,7 @@ arc_flush(spa_t *spa)
         uint64_t guid = 0;
  
         if (spa)
-               guid = spa_guid(spa);
+               guid = spa_load_guid(spa);
  
         while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
@@ -1858,16 +2091,13 @@ arc_flush(spa_t *spa)
  }
  
  void
-arc_shrink(void)
+arc_shrink(uint64_t bytes)
  {
         if (arc_c > arc_c_min) {
                 uint64_t to_free;
  
-#ifdef _KERNEL
-               to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
-#else
-               to_free = arc_c >> arc_shrink_shift;
-#endif
+               to_free = bytes ? bytes : arc_c >> arc_shrink_shift;
+
                 if (arc_c > arc_c_min + to_free)
                         atomic_add_64(&arc_c, -to_free);
                 else
@@ -1886,67 +2116,8 @@ arc_shrink(void)
                 arc_adjust();
  }
  
-static int
-arc_reclaim_needed(void)
-{
-       uint64_t extra;
-
-#ifdef _KERNEL
-
-       if (needfree)
-               return (1);
-
-       /*
-        * take 'desfree' extra pages, so we reclaim sooner, rather than later
-        */
-       extra = desfree;
-
-       /*
-        * check that we're out of range of the pageout scanner.  It starts to
-        * schedule paging if freemem is less than lotsfree and needfree.
-        * lotsfree is the high-water mark for pageout, and needfree is the
-        * number of needed free pages.  We add extra pages here to make sure
-        * the scanner doesn't start up while we're freeing memory.
-        */
-       if (freemem < lotsfree + needfree + extra)
-               return (1);
-
-       /*
-        * check to make sure that swapfs has enough space so that anon
-        * reservations can still succeed. anon_resvmem() checks that the
-        * availrmem is greater than swapfs_minfree, and the number of reserved
-        * swap pages.  We also add a bit of extra here just to prevent
-        * circumstances from getting really dire.
-        */
-       if (availrmem < swapfs_minfree + swapfs_reserve + extra)
-               return (1);
-
-#if defined(__i386)
-       /*
-        * If we're on an i386 platform, it's possible that we'll exhaust the
-        * kernel heap space before we ever run out of available physical
-        * memory.  Most checks of the size of the heap_area compare against
-        * tune.t_minarmem, which is the minimum available real memory that we
-        * can have in the system.  However, this is generally fixed at 25 pages
-        * which is so low that it's useless.  In this comparison, we seek to
-        * calculate the total heap-size, and reclaim if more than 3/4ths of the
-        * heap is allocated.  (Or, in the calculation, if less than 1/4th is
-        * free)
-        */
-       if (btop(vmem_size(heap_arena, VMEM_FREE)) <
-           (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
-               return (1);
-#endif
-
-#else
-       if (spa_get_random(100) == 0)
-               return (1);
-#endif
-       return (0);
-}
-
  static void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
  {
         size_t                  i;
         kmem_cache_t            *prev_cache = NULL;
@@ -1954,28 +2125,12 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
         extern kmem_cache_t     *zio_buf_cache[];
         extern kmem_cache_t     *zio_data_buf_cache[];
  
-#ifdef _KERNEL
-       if (arc_meta_used >= arc_meta_limit) {
-               /*
-                * We are exceeding our meta-data cache limit.
-                * Purge some DNLC entries to release holds on meta-data.
-                */
-               dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
-       }
-#if defined(__i386)
-       /*
-        * Reclaim unused memory from all kmem caches.
-        */
-       kmem_reap();
-#endif
-#endif
-
         /*
          * An aggressive reclamation will shrink the cache size as well as
          * reap free buffers from the arc kmem caches.
          */
         if (strat == ARC_RECLAIM_AGGR)
-               arc_shrink();
+               arc_shrink(bytes);
  
         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
                 if (zio_buf_cache[i] != prev_cache) {
@@ -1987,22 +2142,32 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
                         kmem_cache_reap_now(zio_data_buf_cache[i]);
                 }
         }
+
         kmem_cache_reap_now(buf_cache);
         kmem_cache_reap_now(hdr_cache);
  }
  
+/*
+ * Unlike other ZFS implementations this thread is only responsible for
+ * adapting the target ARC size on Linux.  The responsibility for memory
+ * reclamation has been entirely delegated to the arc_shrinker_func()
+ * which is registered with the VM.  To reflect this change in behavior
+ * the arc_reclaim thread has been renamed to arc_adapt.
+ */
  static void
-arc_reclaim_thread(void)
+arc_adapt_thread(void)
  {
-       clock_t                 growtime = 0;
-       arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
         callb_cpr_t             cpr;
+       int64_t                 prune;
  
         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
  
         mutex_enter(&arc_reclaim_thr_lock);
         while (arc_thread_exit == 0) {
-               if (arc_reclaim_needed()) {
+#ifndef _KERNEL
+               arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
+
+               if (spa_get_random(100) == 0) {
  
                         if (arc_no_grow) {
                                 if (last_reclaim == ARC_RECLAIM_CONS) {
@@ -2017,26 +2182,35 @@ arc_reclaim_thread(void)
                         }
  
                         /* reset the growth delay for every reclaim */
-                       growtime = lbolt + (arc_grow_retry * hz);
+                       arc_grow_time = ddi_get_lbolt()+(arc_grow_retry * hz);
  
-                       arc_kmem_reap_now(last_reclaim);
+                       arc_kmem_reap_now(last_reclaim, 0);
                         arc_warm = B_TRUE;
+               }
+#endif /* !_KERNEL */
  
-               } else if (arc_no_grow && lbolt >= growtime) {
+               /* No recent memory pressure, so allow the ARC to grow. */
+               if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
                         arc_no_grow = FALSE;
-               }
  
-               if (2 * arc_c < arc_size +
-                   arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
-                       arc_adjust();
+               /*
+                * Keep meta data usage within limits; arc_shrink() is not
+                * used to avoid collapsing the arc_c value when only the
+                * arc_meta_limit is being exceeded.
+                */
+               prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
+               if (prune > 0)
+                       arc_adjust_meta(prune, B_TRUE);
+
+               arc_adjust();
  
                 if (arc_eviction_list != NULL)
                         arc_do_user_evicts();
  
                 /* block until needed, or one second, whichever is shorter */
                 CALLB_CPR_SAFE_BEGIN(&cpr);
-               (void) cv_timedwait(&arc_reclaim_thr_cv,
-                   &arc_reclaim_thr_lock, (lbolt + hz));
+               (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
+                   &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
         }
  
@@ -2046,6 +2220,131 @@ arc_reclaim_thread(void)
         thread_exit();
  }
  
+#ifdef _KERNEL
+/*
+ * Determine the amount of memory eligible for eviction contained in the
+ * ARC. All clean data reported by the ghost lists can always be safely
+ * evicted. Due to arc_c_min, the same does not hold for all clean data
+ * contained by the regular mru and mfu lists.
+ *
+ * In the case of the regular mru and mfu lists, we need to report as
+ * much clean data as possible, such that evicting that same reported
+ * data will not bring arc_size below arc_c_min. Thus, in certain
+ * circumstances, the total amount of clean data in the mru and mfu
+ * lists might not actually be evictable.
+ *
+ * The following two distinct cases are accounted for:
+ *
+ * 1. The sum of the amount of dirty data contained by both the mru and
+ *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ *    is greater than or equal to arc_c_min.
+ *    (i.e. amount of dirty data >= arc_c_min)
+ *
+ *    This is the easy case; all clean data contained by the mru and mfu
+ *    lists is evictable. Evicting all clean data can only drop arc_size
+ *    to the amount of dirty data, which is at least arc_c_min.
+ *
+ * 2. The sum of the amount of dirty data contained by both the mru and
+ *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ *    is less than arc_c_min.
+ *    (i.e. arc_c_min > amount of dirty data)
+ *
+ *    2.1. arc_size is greater than or equal to arc_c_min.
+ *         (i.e. arc_size >= arc_c_min > amount of dirty data)
+ *
+ *         In this case, not all clean data from the regular mru and mfu
+ *         lists is actually evictable; we must leave enough clean data
+ *         to keep arc_size above arc_c_min. Thus, the maximum amount of
+ *         evictable data from the two lists combined, is exactly the
+ *         difference between arc_size and arc_c_min.
+ *
+ *    2.2. arc_size is less than arc_c_min
+ *         (i.e. arc_c_min > arc_size > amount of dirty data)
+ *
+ *         In this case, none of the data contained in the mru and mfu
+ *         lists is evictable, even if it's clean. Since arc_size is
+ *         already below arc_c_min, evicting any more would only
+ *         increase this negative difference.
+ */
+static uint64_t
+arc_evictable_memory(void) {
+       uint64_t arc_clean =
+           arc_mru->arcs_lsize[ARC_BUFC_DATA] +
+           arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+           arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
+           arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+       uint64_t ghost_clean =
+           arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
+           arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
+           arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
+           arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
+       uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
+
+       if (arc_dirty >= arc_c_min)
+               return (ghost_clean + arc_clean);
+
+       return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
+}
+
+static int
+__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
+{
+       uint64_t pages;
+
+       /* The arc is considered warm once reclaim has occurred */
+       if (unlikely(arc_warm == B_FALSE))
+               arc_warm = B_TRUE;
+
+       /* Return the potential number of reclaimable pages */
+       pages = btop(arc_evictable_memory());
+       if (sc->nr_to_scan == 0)
+               return (pages);
+
+       /* Not allowed to perform filesystem reclaim */
+       if (!(sc->gfp_mask & __GFP_FS))
+               return (-1);
+
+       /* Reclaim in progress */
+       if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
+               return (-1);
+
+       /*
+        * Evict the requested number of pages by shrinking arc_c the
+        * requested amount.  If there is nothing left to evict just
+        * reap whatever we can from the various arc slabs.
+        */
+       if (pages > 0) {
+               arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
+               pages = btop(arc_evictable_memory());
+       } else {
+               arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
+               pages = -1;
+       }
+
+       /*
+        * When direct reclaim is observed it usually indicates a rapid
+        * increase in memory pressure.  This occurs because the kswapd
+        * threads were unable to asynchronously keep enough free memory
+        * available.  In this case set arc_no_grow to briefly pause arc
+        * growth to avoid compounding the memory pressure.
+        */
+       if (current_is_kswapd()) {
+               ARCSTAT_BUMP(arcstat_memory_indirect_count);
+       } else {
+               arc_no_grow = B_TRUE;
+               arc_grow_time = ddi_get_lbolt() + (arc_grow_retry * hz);
+               ARCSTAT_BUMP(arcstat_memory_direct_count);
+       }
+
+       mutex_exit(&arc_reclaim_thr_lock);
+
+       return (pages);
+}
+SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
+
+SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
+#endif /* _KERNEL */
+
  /*
   * Adapt arc info given the number of bytes we are trying to add and
   * the state that we are comming from.  This function is only called
@@ -2072,6 +2371,7 @@ arc_adapt(int bytes, arc_state_t *state)
         if (state == arc_mru_ghost) {
                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+               mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
  
                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
         } else if (state == arc_mfu_ghost) {
@@ -2079,17 +2379,13 @@ arc_adapt(int bytes, arc_state_t *state)
  
                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+               mult = MIN(mult, 10);
  
                 delta = MIN(bytes * mult, arc_p);
                 arc_p = MAX(arc_p_min, arc_p - delta);
         }
         ASSERT((int64_t)arc_p >= 0);
  
-       if (arc_reclaim_needed()) {
-               cv_signal(&arc_reclaim_thr_cv);
-               return;
-       }
-
         if (arc_no_grow)
                 return;
  
@@ -2134,7 +2430,7 @@ arc_evict_needed(arc_buf_contents_t type)
                 return (1);
  #endif
  
-       if (arc_reclaim_needed())
+       if (arc_no_grow)
                 return (1);
  
         return (arc_size > arc_c);
@@ -2208,16 +2504,27 @@ arc_get_data_buf(arc_buf_t *buf)
                 state =  (arc_mru->arcs_lsize[type] >= size &&
                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
         }
-       if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
+
+       if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
                 if (type == ARC_BUFC_METADATA) {
                         buf->b_data = zio_buf_alloc(size);
                         arc_space_consume(size, ARC_SPACE_DATA);
+
+                       /*
+                        * If we are unable to recycle an existing meta buffer
+                        * signal the reclaim thread.  It will notify users
+                        * via the prune callback to drop references.  The
+                        * prune callback is run in the context of the reclaim
+                        * thread to avoid deadlocking on the hash_lock.
+                        */
+                       cv_signal(&arc_reclaim_thr_cv);
                 } else {
                         ASSERT(type == ARC_BUFC_DATA);
                         buf->b_data = zio_data_buf_alloc(size);
                         ARCSTAT_INCR(arcstat_data_size, size);
                         atomic_add_64(&arc_size, size);
                 }
+
                 ARCSTAT_BUMP(arcstat_recycle_miss);
         }
         ASSERT(buf->b_data != NULL);
@@ -2251,6 +2558,8 @@ out:
  static void
  arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
  {
+       clock_t now;
+
         ASSERT(MUTEX_HELD(hash_lock));
  
         if (buf->b_state == arc_anon) {
@@ -2261,11 +2570,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                  */
  
                 ASSERT(buf->b_arc_access == 0);
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
                 arc_change_state(arc_mru, buf, hash_lock);
  
         } else if (buf->b_state == arc_mru) {
+               now = ddi_get_lbolt();
+
                 /*
                  * If this buffer is here because of a prefetch, then either:
                  * - clear the flag if this is a "referencing" read
@@ -2281,7 +2592,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                                 buf->b_flags &= ~ARC_PREFETCH;
                                 ARCSTAT_BUMP(arcstat_mru_hits);
                         }
-                       buf->b_arc_access = lbolt;
+                       buf->b_arc_access = now;
                         return;
                 }
  
@@ -2290,13 +2601,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                  * but it is still in the cache. Move it to the MFU
                  * state.
                  */
-               if (lbolt > buf->b_arc_access + ARC_MINTIME) {
+               if (now > buf->b_arc_access + ARC_MINTIME) {
                         /*
                          * More than 125ms have passed since we
                          * instantiated this buffer.  Move it to the
                          * most frequently used state.
                          */
-                       buf->b_arc_access = lbolt;
+                       buf->b_arc_access = now;
                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
                         arc_change_state(arc_mfu, buf, hash_lock);
                 }
@@ -2319,7 +2630,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
                 }
  
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
                 arc_change_state(new_state, buf, hash_lock);
  
                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
@@ -2338,7 +2649,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                         ASSERT(list_link_active(&buf->b_arc_node));
                 }
                 ARCSTAT_BUMP(arcstat_mfu_hits);
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
         } else if (buf->b_state == arc_mfu_ghost) {
                 arc_state_t     *new_state = arc_mfu;
                 /*
@@ -2356,7 +2667,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                         new_state = arc_mru;
                 }
  
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
                 arc_change_state(new_state, buf, hash_lock);
  
@@ -2366,7 +2677,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                  * This buffer is on the 2nd Level ARC.
                  */
  
-               buf->b_arc_access = lbolt;
+               buf->b_arc_access = ddi_get_lbolt();
                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
                 arc_change_state(arc_mfu, buf, hash_lock);
         } else {
@@ -2379,7 +2690,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
  void
  arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
  {
-       bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+       if (zio == NULL || zio->io_error == 0)
+               bcopy(buf->b_data, arg, buf->b_hdr->b_size);
         VERIFY(arc_buf_remove_ref(buf, arg) == 1);
  }
  
@@ -2393,6 +2705,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
                 *bufp = NULL;
         } else {
                 *bufp = buf;
+               ASSERT(buf->b_data);
         }
  }
  
@@ -2431,7 +2744,7 @@ arc_read_done(zio_t *zio)
         /* byteswap if necessary */
         callback_list = hdr->b_acb;
         ASSERT(callback_list != NULL);
-       if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+       if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
                 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
                     byteswap_uint64_array :
                     dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
@@ -2440,6 +2753,16 @@ arc_read_done(zio_t *zio)
  
         arc_cksum_compute(buf, B_FALSE);
  
+       if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+               /*
+                * Only call arc_access on anonymous buffers.  This is because
+                * if we've issued an I/O for an evicted buffer, we've already
+                * called arc_access (to prevent any simultaneous readers from
+                * getting confused).
+                */
+               arc_access(hdr, hash_lock);
+       }
+
         /* create copies of the data buffer for the callers */
         abuf = buf;
         for (acb = callback_list; acb; acb = acb->acb_next) {
@@ -2453,8 +2776,11 @@ arc_read_done(zio_t *zio)
         hdr->b_acb = NULL;
         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
         ASSERT(!HDR_BUF_AVAILABLE(hdr));
-       if (abuf == buf)
+       if (abuf == buf) {
+               ASSERT(buf->b_efunc == NULL);
+               ASSERT(hdr->b_datacnt == 1);
                 hdr->b_flags |= ARC_BUF_AVAILABLE;
+       }
  
         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
  
@@ -2475,14 +2801,6 @@ arc_read_done(zio_t *zio)
         cv_broadcast(&hdr->b_cv);
  
         if (hash_lock) {
-               /*
-                * Only call arc_access on anonymous buffers.  This is because
-                * if we've issued an I/O for an evicted buffer, we've already
-                * called arc_access (to prevent any simultaneous readers from
-                * getting confused).
-                */
-               if (zio->io_error == 0 && hdr->b_state == arc_anon)
-                       arc_access(hdr, hash_lock);
                 mutex_exit(hash_lock);
         } else {
                 /*
@@ -2536,36 +2854,46 @@ arc_read_done(zio_t *zio)
   * arc_read_bp.
   */
  int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
      arc_done_func_t *done, void *private, int priority, int zio_flags,
      uint32_t *arc_flags, const zbookmark_t *zb)
  {
         int err;
  
+       if (pbuf == NULL) {
+               /*
+                * XXX No parent buffer to lock; this happens from traverse
+                * callback funcs, for the objset_phys_t block.
+                */
+               return (arc_read_nolock(pio, spa, bp, done, private, priority,
+                   zio_flags, arc_flags, zb));
+       }
+
         ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
         ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
-       rw_enter(&pbuf->b_lock, RW_READER);
+       rw_enter(&pbuf->b_data_lock, RW_READER);        /* dropped after the call */
  
         err = arc_read_nolock(pio, spa, bp, done, private, priority,
             zio_flags, arc_flags, zb);
-       rw_exit(&pbuf->b_lock);
+       rw_exit(&pbuf->b_data_lock);
  
         return (err);
  }
  
  int
-arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
      arc_done_func_t *done, void *private, int priority, int zio_flags,
      uint32_t *arc_flags, const zbookmark_t *zb)
  {
         arc_buf_hdr_t *hdr;
-       arc_buf_t *buf;
+       arc_buf_t *buf = NULL;
         kmutex_t *hash_lock;
         zio_t *rzio;
-       uint64_t guid = spa_guid(spa);
+       uint64_t guid = spa_load_guid(spa);
  
  top:
-       hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+       hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+           &hash_lock);
         if (hdr && hdr->b_datacnt > 0) {
  
                 *arc_flags |= ARC_CACHED;
@@ -2583,7 +2911,7 @@ top:
                                 arc_callback_t  *acb = NULL;
  
                                 acb = kmem_zalloc(sizeof (arc_callback_t),
-                                   KM_SLEEP);
+                                   KM_PUSHPAGE);
                                 acb->acb_done = done;
                                 acb->acb_private = private;
                                 if (pio != NULL)
@@ -2619,6 +2947,7 @@ top:
                         } else {
                                 buf = arc_buf_clone(buf);
                         }
+
                 } else if (*arc_flags & ARC_PREFETCH &&
                     refcount_count(&hdr->b_refcnt) == 0) {
                         hdr->b_flags |= ARC_PREFETCH;
@@ -2639,7 +2968,7 @@ top:
                 uint64_t size = BP_GET_LSIZE(bp);
                 arc_callback_t  *acb;
                 vdev_t *vd = NULL;
-               uint64_t addr;
+               uint64_t addr = -1;
                 boolean_t devw = B_FALSE;
  
                 if (hdr == NULL) {
@@ -2649,15 +2978,13 @@ top:
                         buf = arc_buf_alloc(spa, size, private, type);
                         hdr = buf->b_hdr;
                         hdr->b_dva = *BP_IDENTITY(bp);
-                       hdr->b_birth = bp->blk_birth;
+                       hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
                         hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
                         exists = buf_hash_insert(hdr, &hash_lock);
                         if (exists) {
                                 /* somebody beat us to the hash insert */
                                 mutex_exit(hash_lock);
-                               bzero(&hdr->b_dva, sizeof (dva_t));
-                               hdr->b_birth = 0;
-                               hdr->b_cksum0 = 0;
+                               buf_discard_identity(hdr);
                                 (void) arc_buf_remove_ref(buf, private);
                                 goto top; /* restart the IO request */
                         }
@@ -2692,13 +3019,15 @@ top:
                         buf->b_private = NULL;
                         buf->b_next = NULL;
                         hdr->b_buf = buf;
-                       arc_get_data_buf(buf);
                         ASSERT(hdr->b_datacnt == 0);
                         hdr->b_datacnt = 1;
-
+                       arc_get_data_buf(buf);
+                       arc_access(hdr, hash_lock);
                 }
  
-               acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+               ASSERT(!GHOST_STATE(hdr->b_state));
+
+               acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE);
                 acb->acb_done = done;
                 acb->acb_private = private;
  
@@ -2706,17 +3035,6 @@ top:
                 hdr->b_acb = acb;
                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
  
-               /*
-                * If the buffer has been evicted, migrate it to a present state
-                * before issuing the I/O.  Once we drop the hash-table lock,
-                * the header will be marked as I/O in progress and have an
-                * attached buffer.  At this point, anybody who finds this
-                * buffer ought to notice that it's legit but has a pending I/O.
-                */
-
-               if (GHOST_STATE(hdr->b_state))
-                       arc_access(hdr, hash_lock);
-
                 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
@@ -2732,8 +3050,8 @@ top:
                 mutex_exit(hash_lock);
  
                 ASSERT3U(hdr->b_size, ==, size);
-               DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
-                   zbookmark_t *, zb);
+               DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
+                   uint64_t, size, zbookmark_t *, zb);
                 ARCSTAT_BUMP(arcstat_misses);
                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
@@ -2758,7 +3076,7 @@ top:
                                 ARCSTAT_BUMP(arcstat_l2_hits);
  
                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
-                                   KM_SLEEP);
+                                   KM_PUSHPAGE);
                                 cb->l2rcb_buf = buf;
                                 cb->l2rcb_spa = spa;
                                 cb->l2rcb_bp = *bp;
@@ -2819,39 +3137,35 @@ top:
         return (0);
  }
  
-/*
- * arc_read() variant to support pool traversal.  If the block is already
- * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
- * The idea is that we don't want pool traversal filling up memory, but
- * if the ARC already has the data anyway, we shouldn't pay for the I/O.
- */
-int
-arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
  {
-       arc_buf_hdr_t *hdr;
-       kmutex_t *hash_mtx;
-       uint64_t guid = spa_guid(spa);
-       int rc = 0;
-
-       hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
+       arc_prune_t *p;
  
-       if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
-               arc_buf_t *buf = hdr->b_buf;
+       p = kmem_alloc(sizeof(*p), KM_SLEEP);
+       p->p_pfunc = func;      /* record the callback and its argument */
+       p->p_private = private;
+       list_link_init(&p->p_node);
+       refcount_create(&p->p_refcnt);
  
-               ASSERT(buf);
-               while (buf->b_data == NULL) {
-                       buf = buf->b_next;
-                       ASSERT(buf);
-               }
-               bcopy(buf->b_data, data, hdr->b_size);
-       } else {
-               rc = ENOENT;
-       }
+       mutex_enter(&arc_prune_mtx);    /* list is changed under this mutex */
+       refcount_add(&p->p_refcnt, &arc_prune_list);    /* tagged with the list */
+       list_insert_head(&arc_prune_list, p);
+       mutex_exit(&arc_prune_mtx);
  
-       if (hash_mtx)
-               mutex_exit(hash_mtx);
+       return (p);     /* presumably passed to arc_remove_prune_callback() */
+}
  
-       return (rc);
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+       mutex_enter(&arc_prune_mtx);
+       list_remove(&arc_prune_list, p);        /* unlink, then drop the list's ref */
+       if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
+               refcount_destroy(&p->p_refcnt); /* only when the result is 0 */
+               kmem_free(p, sizeof (*p));
+       }       /* otherwise p is not freed by this function */
+       mutex_exit(&arc_prune_mtx);
  }
  
  void
@@ -2860,6 +3174,9 @@ arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
         ASSERT(buf->b_hdr != NULL);
         ASSERT(buf->b_hdr->b_state != arc_anon);
         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+       ASSERT(buf->b_efunc == NULL);
+       ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
+
         buf->b_efunc = func;
         buf->b_private = private;
  }
@@ -2876,14 +3193,14 @@ arc_buf_evict(arc_buf_t *buf)
         kmutex_t *hash_lock;
         arc_buf_t **bufp;
  
-       rw_enter(&buf->b_lock, RW_WRITER);
+       mutex_enter(&buf->b_evict_lock);
         hdr = buf->b_hdr;
         if (hdr == NULL) {
                 /*
                  * We are in arc_do_user_evicts().
                  */
                 ASSERT(buf->b_data == NULL);
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                 return (0);
         } else if (buf->b_data == NULL) {
                 arc_buf_t copy = *buf; /* structure assignment */
@@ -2892,14 +3209,15 @@ arc_buf_evict(arc_buf_t *buf)
                  * but let arc_do_user_evicts() do the reaping.
                  */
                 buf->b_efunc = NULL;
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                 VERIFY(copy.b_efunc(&copy) == 0);
                 return (1);
         }
         hash_lock = HDR_LOCK(hdr);
         mutex_enter(hash_lock);
+       hdr = buf->b_hdr;
+       ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
  
-       ASSERT(buf->b_hdr == hdr);
         ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
  
@@ -2918,6 +3236,7 @@ arc_buf_evict(arc_buf_t *buf)
                 arc_state_t *old_state = hdr->b_state;
                 arc_state_t *evicted_state;
  
+               ASSERT(hdr->b_buf == NULL);
                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
  
                 evicted_state =
@@ -2935,12 +3254,13 @@ arc_buf_evict(arc_buf_t *buf)
                 mutex_exit(&old_state->arcs_mtx);
         }
         mutex_exit(hash_lock);
-       rw_exit(&buf->b_lock);
+       mutex_exit(&buf->b_evict_lock);
  
         VERIFY(buf->b_efunc(buf) == 0);
         buf->b_efunc = NULL;
         buf->b_private = NULL;
         buf->b_hdr = NULL;
+       buf->b_next = NULL;
         kmem_cache_free(buf_cache, buf);
         return (1);
  }
@@ -2955,29 +3275,30 @@ void
  arc_release(arc_buf_t *buf, void *tag)
  {
         arc_buf_hdr_t *hdr;
-       kmutex_t *hash_lock;
+       kmutex_t *hash_lock = NULL;
         l2arc_buf_hdr_t *l2hdr;
-       uint64_t buf_size;
-       boolean_t released = B_FALSE;
+       uint64_t buf_size = 0;
  
-       rw_enter(&buf->b_lock, RW_WRITER);
+       /*
+        * It would be nice to assert that if it's DMU metadata (level >
+        * 0 || it's the dnode file), then it must be syncing context.
+        * But we don't know that information at this level.
+        */
+
+       mutex_enter(&buf->b_evict_lock);
         hdr = buf->b_hdr;
  
         /* this buffer is not on any list */
         ASSERT(refcount_count(&hdr->b_refcnt) > 0);
-       ASSERT(!(hdr->b_flags & ARC_STORED));
  
         if (hdr->b_state == arc_anon) {
                 /* this buffer is already released */
-               ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
-               ASSERT(BUF_EMPTY(hdr));
                 ASSERT(buf->b_efunc == NULL);
-               arc_buf_thaw(buf);
-               rw_exit(&buf->b_lock);
-               released = B_TRUE;
         } else {
                 hash_lock = HDR_LOCK(hdr);
                 mutex_enter(hash_lock);
+               hdr = buf->b_hdr;
+               ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
         }
  
         l2hdr = hdr->b_l2hdr;
@@ -2987,9 +3308,6 @@ arc_release(arc_buf_t *buf, void *tag)
                 buf_size = hdr->b_size;
         }
  
-       if (released)
-               goto out;
-
         /*
          * Do we have more than one buf?
          */
@@ -3003,14 +3321,14 @@ arc_release(arc_buf_t *buf, void *tag)
  
                 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
                 /*
-                * Pull the data off of this buf and attach it to
-                * a new anonymous buf.
+                * Pull the data off of this hdr and attach it to
+                * a new anonymous hdr.
                  */
                 (void) remove_reference(hdr, hash_lock, tag);
                 bufp = &hdr->b_buf;
                 while (*bufp != buf)
                         bufp = &(*bufp)->b_next;
-               *bufp = (*bufp)->b_next;
+               *bufp = buf->b_next;
                 buf->b_next = NULL;
  
                 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
@@ -3038,26 +3356,25 @@ arc_release(arc_buf_t *buf, void *tag)
                 nhdr->b_freeze_cksum = NULL;
                 (void) refcount_add(&nhdr->b_refcnt, tag);
                 buf->b_hdr = nhdr;
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                 atomic_add_64(&arc_anon->arcs_size, blksz);
         } else {
-               rw_exit(&buf->b_lock);
+               mutex_exit(&buf->b_evict_lock);
                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
                 ASSERT(!list_link_active(&hdr->b_arc_node));
                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-               arc_change_state(arc_anon, hdr, hash_lock);
+               if (hdr->b_state != arc_anon)
+                       arc_change_state(arc_anon, hdr, hash_lock);
                 hdr->b_arc_access = 0;
-               mutex_exit(hash_lock);
+               if (hash_lock)
+                       mutex_exit(hash_lock);
  
-               bzero(&hdr->b_dva, sizeof (dva_t));
-               hdr->b_birth = 0;
-               hdr->b_cksum0 = 0;
+               buf_discard_identity(hdr);
                 arc_buf_thaw(buf);
         }
         buf->b_efunc = NULL;
         buf->b_private = NULL;
  
-out:
         if (l2hdr) {
                 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
                 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
@@ -3066,14 +3383,27 @@ out:
         }
  }
  
+/*
+ * Release this buffer.  If it does not match the provided BP, fill it
+ * with that block's contents.
+ */
+/* ARGSUSED */
+int
+arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+    zbookmark_t *zb)
+{
+       arc_release(buf, tag);  /* bp, spa and zb are not consulted here */
+       return (0);             /* unconditionally 0 */
+}
+
  int
  arc_released(arc_buf_t *buf)
  {
         int released;
  
-       rw_enter(&buf->b_lock, RW_READER);
+       mutex_enter(&buf->b_evict_lock);        /* b_data/b_hdr read under lock */
         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
-       rw_exit(&buf->b_lock);
+       mutex_exit(&buf->b_evict_lock);
         return (released);
  }
  
@@ -3082,9 +3412,9 @@ arc_has_callback(arc_buf_t *buf)
  {
         int callback;
  
-       rw_enter(&buf->b_lock, RW_READER);
+       mutex_enter(&buf->b_evict_lock);
         callback = (buf->b_efunc != NULL);
-       rw_exit(&buf->b_lock);
+       mutex_exit(&buf->b_evict_lock);
         return (callback);
  }
  
@@ -3094,9 +3424,9 @@ arc_referenced(arc_buf_t *buf)
  {
         int referenced;
  
-       rw_enter(&buf->b_lock, RW_READER);
+       mutex_enter(&buf->b_evict_lock);
         referenced = (refcount_count(&buf->b_hdr->b_refcnt));
-       rw_exit(&buf->b_lock);
+       mutex_exit(&buf->b_evict_lock);
         return (referenced);
  }
  #endif
@@ -3136,21 +3466,28 @@ arc_write_done(zio_t *zio)
         arc_buf_t *buf = callback->awcb_buf;
         arc_buf_hdr_t *hdr = buf->b_hdr;
  
-       hdr->b_acb = NULL;
+       ASSERT(hdr->b_acb == NULL);
+
+       if (zio->io_error == 0) {
+               hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+               hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+               hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+       } else {
+               ASSERT(BUF_EMPTY(hdr));
+       }
  
-       hdr->b_dva = *BP_IDENTITY(zio->io_bp);
-       hdr->b_birth = zio->io_bp->blk_birth;
-       hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
         /*
          * If the block to be written was all-zero, we may have
          * compressed it away.  In this case no write was performed
-        * so there will be no dva/birth-date/checksum.  The buffer
-        * must therefor remain anonymous (and uncached).
+        * so there will be no dva/birth/checksum.  The buffer must
+        * therefore remain anonymous (and uncached).
          */
         if (!BUF_EMPTY(hdr)) {
                 arc_buf_hdr_t *exists;
                 kmutex_t *hash_lock;
  
+               ASSERT(zio->io_error == 0);
+
                 arc_cksum_verify(buf);
  
                 exists = buf_hash_insert(hdr, &hash_lock);
@@ -3160,106 +3497,54 @@ arc_write_done(zio_t *zio)
                          * sync-to-convergence, because we remove
                          * buffers from the hash table when we arc_free().
                          */
-                       ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
-                       ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
-                           BP_IDENTITY(zio->io_bp)));
-                       ASSERT3U(zio->io_bp_orig.blk_birth, ==,
-                           zio->io_bp->blk_birth);
-
-                       ASSERT(refcount_is_zero(&exists->b_refcnt));
-                       arc_change_state(arc_anon, exists, hash_lock);
-                       mutex_exit(hash_lock);
-                       arc_hdr_destroy(exists);
-                       exists = buf_hash_insert(hdr, &hash_lock);
-                       ASSERT3P(exists, ==, NULL);
+                       if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+                               if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+                                       panic("bad overwrite, hdr=%p exists=%p",
+                                           (void *)hdr, (void *)exists);
+                               ASSERT(refcount_is_zero(&exists->b_refcnt));
+                               arc_change_state(arc_anon, exists, hash_lock);
+                               mutex_exit(hash_lock);
+                               arc_hdr_destroy(exists);
+                               exists = buf_hash_insert(hdr, &hash_lock);
+                               ASSERT3P(exists, ==, NULL);
+                       } else {
+                               /* Dedup */
+                               ASSERT(hdr->b_datacnt == 1);
+                               ASSERT(hdr->b_state == arc_anon);
+                               ASSERT(BP_GET_DEDUP(zio->io_bp));
+                               ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+                       }
                 }
                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
                 /* if it's not anon, we are doing a scrub */
-               if (hdr->b_state == arc_anon)
+               if (!exists && hdr->b_state == arc_anon)
                         arc_access(hdr, hash_lock);
                 mutex_exit(hash_lock);
-       } else if (callback->awcb_done == NULL) {
-               int destroy_hdr;
-               /*
-                * This is an anonymous buffer with no user callback,
-                * destroy it if there are no active references.
-                */
-               mutex_enter(&arc_eviction_mtx);
-               destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
-               hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
-               mutex_exit(&arc_eviction_mtx);
-               if (destroy_hdr)
-                       arc_hdr_destroy(hdr);
         } else {
                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
         }
-       hdr->b_flags &= ~ARC_STORED;
  
-       if (callback->awcb_done) {
-               ASSERT(!refcount_is_zero(&hdr->b_refcnt));
-               callback->awcb_done(zio, buf, callback->awcb_private);
-       }
+       ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+       callback->awcb_done(zio, buf, callback->awcb_private);
  
         kmem_free(callback, sizeof (arc_write_callback_t));
  }
  
-void
-write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
-{
-       boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
-
-       /* Determine checksum setting */
-       if (ismd) {
-               /*
-                * Metadata always gets checksummed.  If the data
-                * checksum is multi-bit correctable, and it's not a
-                * ZBT-style checksum, then it's suitable for metadata
-                * as well.  Otherwise, the metadata checksum defaults
-                * to fletcher4.
-                */
-               if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
-                   !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
-                       zp->zp_checksum = wp->wp_oschecksum;
-               else
-                       zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
-       } else {
-               zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
-                   wp->wp_oschecksum);
-       }
-
-       /* Determine compression setting */
-       if (ismd) {
-               /*
-                * XXX -- we should design a compression algorithm
-                * that specializes in arrays of bps.
-                */
-               zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
-                   ZIO_COMPRESS_LZJB;
-       } else {
-               zp->zp_compress = zio_compress_select(wp->wp_dncompress,
-                   wp->wp_oscompress);
-       }
-
-       zp->zp_type = wp->wp_type;
-       zp->zp_level = wp->wp_level;
-       zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
-}
-
  zio_t *
-arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
-    boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
-    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
-    int zio_flags, const zbookmark_t *zb)
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+    arc_done_func_t *ready, arc_done_func_t *done, void *private,
+    int priority, int zio_flags, const zbookmark_t *zb)
  {
         arc_buf_hdr_t *hdr = buf->b_hdr;
         arc_write_callback_t *callback;
         zio_t *zio;
-       zio_prop_t zp;
  
         ASSERT(ready != NULL);
+       ASSERT(done != NULL);
         ASSERT(!HDR_IO_ERROR(hdr));
         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
-       ASSERT(hdr->b_acb == 0);
+       ASSERT(hdr->b_acb == NULL);
         if (l2arc)
                 hdr->b_flags |= ARC_L2CACHE;
         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
@@ -3268,136 +3553,34 @@ arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
         callback->awcb_private = private;
         callback->awcb_buf = buf;
  
-       write_policy(spa, wp, &zp);
-       zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
+       zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
             arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
  
         return (zio);
  }
  
-int
-arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, uint32_t arc_flags)
-{
-       arc_buf_hdr_t *ab;
-       kmutex_t *hash_lock;
-       zio_t   *zio;
-       uint64_t guid = spa_guid(spa);
-
-       /*
-        * If this buffer is in the cache, release it, so it
-        * can be re-used.
-        */
-       ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
-       if (ab != NULL) {
-               /*
-                * The checksum of blocks to free is not always
-                * preserved (eg. on the deadlist).  However, if it is
-                * nonzero, it should match what we have in the cache.
-                */
-               ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
-                   bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
-                   bp->blk_fill == BLK_FILL_ALREADY_FREED);
-
-               if (ab->b_state != arc_anon)
-                       arc_change_state(arc_anon, ab, hash_lock);
-               if (HDR_IO_IN_PROGRESS(ab)) {
-                       /*
-                        * This should only happen when we prefetch.
-                        */
-                       ASSERT(ab->b_flags & ARC_PREFETCH);
-                       ASSERT3U(ab->b_datacnt, ==, 1);
-                       ab->b_flags |= ARC_FREED_IN_READ;
-                       if (HDR_IN_HASH_TABLE(ab))
-                               buf_hash_remove(ab);
-                       ab->b_arc_access = 0;
-                       bzero(&ab->b_dva, sizeof (dva_t));
-                       ab->b_birth = 0;
-                       ab->b_cksum0 = 0;
-                       ab->b_buf->b_efunc = NULL;
-                       ab->b_buf->b_private = NULL;
-                       mutex_exit(hash_lock);
-               } else if (refcount_is_zero(&ab->b_refcnt)) {
-                       ab->b_flags |= ARC_FREE_IN_PROGRESS;
-                       mutex_exit(hash_lock);
-                       arc_hdr_destroy(ab);
-                       ARCSTAT_BUMP(arcstat_deleted);
-               } else {
-                       /*
-                        * We still have an active reference on this
-                        * buffer.  This can happen, e.g., from
-                        * dbuf_unoverride().
-                        */
-                       ASSERT(!HDR_IN_HASH_TABLE(ab));
-                       ab->b_arc_access = 0;
-                       bzero(&ab->b_dva, sizeof (dva_t));
-                       ab->b_birth = 0;
-                       ab->b_cksum0 = 0;
-                       ab->b_buf->b_efunc = NULL;
-                       ab->b_buf->b_private = NULL;
-                       mutex_exit(hash_lock);
-               }
-       }
-
-       zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
-
-       if (arc_flags & ARC_WAIT)
-               return (zio_wait(zio));
-
-       ASSERT(arc_flags & ARC_NOWAIT);
-       zio_nowait(zio);
-
-       return (0);
-}
-
  static int
  arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
  {
  #ifdef _KERNEL
-       uint64_t available_memory = ptob(freemem);
-       static uint64_t page_load = 0;
-       static uint64_t last_txg = 0;
+       uint64_t available_memory;
  
+       /* Easily reclaimable memory (free + inactive + arc-evictable) */
+       available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();
  #if defined(__i386)
         available_memory =
             MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
  #endif
-       if (available_memory >= zfs_write_limit_max)
-               return (0);
  
-       if (txg > last_txg) {
-               last_txg = txg;
-               page_load = 0;
-       }
-       /*
-        * If we are in pageout, we know that memory is already tight,
-        * the arc is already going to be evicting, so we just want to
-        * continue to let page writes occur as quickly as possible.
-        */
-       if (curproc == proc_pageout) {
-               if (page_load > MAX(ptob(minfree), available_memory) / 4)
-                       return (ERESTART);
-               /* Note: reserve is inflated, so we deflate */
-               page_load += reserve / 8;
-               return (0);
-       } else if (page_load > 0 && arc_reclaim_needed()) {
-               /* memory is low, delay before restarting */
+       if (available_memory <= zfs_write_limit_max) {
                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+               DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
                 return (EAGAIN);
         }
-       page_load = 0;
-
-       if (arc_size > arc_c_min) {
-               uint64_t evictable_memory =
-                   arc_mru->arcs_lsize[ARC_BUFC_DATA] +
-                   arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
-                   arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
-                   arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
-               available_memory += MIN(evictable_memory, arc_size - arc_c_min);
-       }
  
         if (inflight_data > available_memory / 4) {
                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+               DMU_TX_STAT_BUMP(dmu_tx_memory_inflight);
                 return (ERESTART);
         }
  #endif
@@ -3428,8 +3611,10 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
  #endif
         if (reserve > arc_c/4 && !arc_no_grow)
                 arc_c = MIN(arc_c_max, reserve * 4);
-       if (reserve > arc_c)
+       if (reserve > arc_c) {
+               DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
                 return (ENOMEM);
+       }
  
         /*
          * Don't count loaned bufs as in flight dirty data to prevent long
@@ -3443,7 +3628,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
          * in order to compress/encrypt/etc the data.  We therefor need to
          * make sure that there is sufficient available memory for this.
          */
-       if (error = arc_memory_throttle(reserve, anon_size, txg))
+       if ((error = arc_memory_throttle(reserve, anon_size, txg)))
                 return (error);
  
         /*
@@ -3462,12 +3647,55 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
                     reserve>>10, arc_c>>10);
+               DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
                 return (ERESTART);
         }
         atomic_add_64(&arc_tempreserve, reserve);
         return (0);
  }
  
+/* Copy one arc state's size and evictable byte counts into its kstats. */
+static void
+arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
+    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
+{
+       size->value.ui64 = state->arcs_size;
+       evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
+       evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
+}
+
+/* kstat ks_update hook: refresh the per-state stats; writes get EACCES. */
+static int
+arc_kstat_update(kstat_t *ksp, int rw)
+{
+       arc_stats_t *stats = ksp->ks_data;
+
+       if (rw == KSTAT_WRITE)
+               return (EACCES);
+
+       arc_kstat_update_state(arc_anon,
+           &stats->arcstat_anon_size,
+           &stats->arcstat_anon_evict_data,
+           &stats->arcstat_anon_evict_metadata);
+       arc_kstat_update_state(arc_mru,
+           &stats->arcstat_mru_size,
+           &stats->arcstat_mru_evict_data,
+           &stats->arcstat_mru_evict_metadata);
+       arc_kstat_update_state(arc_mru_ghost,
+           &stats->arcstat_mru_ghost_size,
+           &stats->arcstat_mru_ghost_evict_data,
+           &stats->arcstat_mru_ghost_evict_metadata);
+       arc_kstat_update_state(arc_mfu,
+           &stats->arcstat_mfu_size,
+           &stats->arcstat_mfu_evict_data,
+           &stats->arcstat_mfu_evict_metadata);
+       arc_kstat_update_state(arc_mfu_ghost,
+           &stats->arcstat_mfu_ghost_size,
+           &stats->arcstat_mfu_ghost_evict_data,
+           &stats->arcstat_mfu_ghost_evict_metadata);
+       return (0);
+}
+
  void
  arc_init(void)
  {
@@ -3487,16 +3715,18 @@ arc_init(void)
          * need to limit the cache to 1/8 of VM size.
          */
         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+       /*
+        * Register a shrinker to support synchronous (direct) memory
+        * reclaim from the arc.  This is done to prevent kswapd from
+        * swapping out pages when it is preferable to shrink the arc.
+        */
+       spl_register_shrinker(&arc_shrinker);
  #endif
  
         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
         arc_c_min = MAX(arc_c / 4, 64<<20);
-       /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
-       if (arc_c * 8 >= 1<<30)
-               arc_c_max = (arc_c * 8) - (1<<30);
-       else
-               arc_c_max = arc_c_min;
-       arc_c_max = MAX(arc_c * 6, arc_c_max);
+       /* set max to 1/2 of all memory */
+       arc_c_max = MAX(arc_c * 4, arc_c_max);
  
         /*
          * Allow the tunables to override our calculations if they are
@@ -3512,6 +3742,7 @@ arc_init(void)
  
         /* limit meta-data to 1/4 of the arc capacity */
         arc_meta_limit = arc_c_max / 4;
+       arc_meta_max = 0;
  
         /* Allow the tunable to override if it is reasonable */
         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
@@ -3529,6 +3760,9 @@ arc_init(void)
         if (zfs_arc_p_min_shift > 0)
                 arc_p_min_shift = zfs_arc_p_min_shift;
  
+       if (zfs_arc_meta_prune > 0)
+               arc_meta_prune = zfs_arc_meta_prune;
+
         /* if kmem_flags are set, lets try to use less memory */
         if (kmem_debugging())
                 arc_c = arc_c / 2;
@@ -3574,7 +3808,10 @@ arc_init(void)
         buf_init();
  
         arc_thread_exit = 0;
+       list_create(&arc_prune_list, sizeof (arc_prune_t),
+           offsetof(arc_prune_t, p_node));
         arc_eviction_list = NULL;
+       mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
  
@@ -3583,10 +3820,11 @@ arc_init(void)
  
         if (arc_ksp != NULL) {
                 arc_ksp->ks_data = &arc_stats;
+               arc_ksp->ks_update = arc_kstat_update;
                 kstat_install(arc_ksp);
         }
  
-       (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+       (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
             TS_RUN, minclsyspri);
  
         arc_dead = FALSE;
@@ -3602,7 +3840,13 @@ arc_init(void)
  void
  arc_fini(void)
  {
+       arc_prune_t *p;
+
         mutex_enter(&arc_reclaim_thr_lock);
+#ifdef _KERNEL
+       spl_unregister_shrinker(&arc_shrinker);
+#endif /* _KERNEL */
+
         arc_thread_exit = 1;
         while (arc_thread_exit != 0)
                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
@@ -3617,6 +3861,17 @@ arc_fini(void)
                 arc_ksp = NULL;
         }
  
+       mutex_enter(&arc_prune_mtx);
+       while ((p = list_head(&arc_prune_list)) != NULL) {
+               list_remove(&arc_prune_list, p);
+               refcount_remove(&p->p_refcnt, &arc_prune_list);
+               refcount_destroy(&p->p_refcnt);
+               kmem_free(p, sizeof (*p));
+       }
+       mutex_exit(&arc_prune_mtx);
+
+       list_destroy(&arc_prune_list);
+       mutex_destroy(&arc_prune_mtx);
         mutex_destroy(&arc_eviction_mtx);
         mutex_destroy(&arc_reclaim_thr_lock);
         cv_destroy(&arc_reclaim_thr_cv);
@@ -3786,12 +4041,11 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
         /*
          * A buffer is *not* eligible for the L2ARC if it:
          * 1. belongs to a different spa.
-        * 2. has no attached buffer.
-        * 3. is already cached on the L2ARC.
-        * 4. has an I/O in progress (it may be an incomplete read).
-        * 5. is flagged not eligible (zfs property).
+        * 2. is already cached on the L2ARC.
+        * 3. has an I/O in progress (it may be an incomplete read).
+        * 4. is flagged not eligible (zfs property).
          */
-       if (ab->b_spa != spa_guid || ab->b_buf == NULL || ab->b_l2hdr != NULL ||
+       if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
             HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
                 return (B_FALSE);
  
@@ -3815,7 +4069,7 @@ l2arc_write_size(l2arc_dev_t *dev)
  static clock_t
  l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
  {
-       clock_t interval, next;
+       clock_t interval, next, now;
  
         /*
          * If the ARC lists are busy, increase our write rate; if the
@@ -3828,7 +4082,8 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
         else
                 interval = hz * l2arc_feed_secs;
  
-       next = MAX(lbolt, MIN(lbolt + interval, began + interval));
+       now = ddi_get_lbolt();
+       next = MAX(now, MIN(now + interval, began + interval));
  
         return (next);
  }
@@ -3912,7 +4167,7 @@ out:
   * Free buffers that were tagged for destruction.
   */
  static void
-l2arc_do_free_on_write()
+l2arc_do_free_on_write(void)
  {
         list_t *buflist;
         l2arc_data_free_t *df, *df_prev;
@@ -4030,11 +4285,11 @@ l2arc_read_done(zio_t *zio)
         ASSERT(cb != NULL);
         buf = cb->l2rcb_buf;
         ASSERT(buf != NULL);
-       hdr = buf->b_hdr;
-       ASSERT(hdr != NULL);
  
-       hash_lock = HDR_LOCK(hdr);
+       hash_lock = HDR_LOCK(buf->b_hdr);
         mutex_enter(hash_lock);
+       hdr = buf->b_hdr;
+       ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
  
         /*
          * Check this survived the L2ARC journey.
@@ -4092,7 +4347,7 @@ l2arc_read_done(zio_t *zio)
  static list_t *
  l2arc_list_locked(int list_num, kmutex_t **lock)
  {
-       list_t *list;
+       list_t *list = NULL;
  
         ASSERT(list_num >= 0 && list_num <= 3);
  
@@ -4247,7 +4502,7 @@ top:
         }
         mutex_exit(&l2arc_buflist_mtx);
  
-       spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
+       vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
         dev->l2ad_evict = taddr;
  }
  
@@ -4265,11 +4520,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
         list_t *list;
         uint64_t passed_sz, write_sz, buf_sz, headroom;
         void *buf_data;
-       kmutex_t *hash_lock, *list_lock;
+       kmutex_t *hash_lock, *list_lock = NULL;
         boolean_t have_lock, full;
         l2arc_write_callback_t *cb;
         zio_t *pio, *wzio;
-       uint64_t guid = spa_guid(spa);
+       uint64_t guid = spa_load_guid(spa);
+       int try;
  
         ASSERT(dev->l2ad_vdev != NULL);
  
@@ -4283,7 +4539,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
          * Copy buffers for L2ARC writing.
          */
         mutex_enter(&l2arc_buflist_mtx);
-       for (int try = 0; try <= 3; try++) {
+       for (try = 0; try <= 3; try++) {
                 list = l2arc_list_locked(try, &list_lock);
                 passed_sz = 0;
  
@@ -4342,8 +4598,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
                                  */
                                 list_insert_head(dev->l2ad_buflist, head);
  
-                               cb = kmem_alloc(
-                                   sizeof (l2arc_write_callback_t), KM_SLEEP);
+                               cb = kmem_alloc(sizeof (l2arc_write_callback_t),
+                                               KM_PUSHPAGE);
                                 cb->l2wcb_dev = dev;
                                 cb->l2wcb_head = head;
                                 pio = zio_root(spa, l2arc_write_done, cb,
@@ -4353,7 +4609,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
                         /*
                          * Create and add a new L2ARC header.
                          */
-                       hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+                       hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
+                                           KM_PUSHPAGE);
                         hdrl2->b_dev = dev;
                         hdrl2->b_daddr = dev->l2ad_hand;
  
@@ -4407,15 +4664,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
         ARCSTAT_BUMP(arcstat_l2_writes_sent);
         ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
         ARCSTAT_INCR(arcstat_l2_size, write_sz);
-       spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
+       vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
  
         /*
          * Bump device hand to the device start if it is approaching the end.
          * l2arc_evict() will already have evicted ahead for this case.
          */
         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
-               spa_l2cache_space_update(dev->l2ad_vdev, 0,
-                   dev->l2ad_end - dev->l2ad_hand);
+               vdev_space_update(dev->l2ad_vdev,
+                   dev->l2ad_end - dev->l2ad_hand, 0, 0);
                 dev->l2ad_hand = dev->l2ad_start;
                 dev->l2ad_evict = dev->l2ad_start;
                 dev->l2ad_first = B_FALSE;
@@ -4439,7 +4696,7 @@ l2arc_feed_thread(void)
         l2arc_dev_t *dev;
         spa_t *spa;
         uint64_t size, wrote;
-       clock_t begin, next = lbolt;
+       clock_t begin, next = ddi_get_lbolt();
  
         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
  
@@ -4447,10 +4704,10 @@ l2arc_feed_thread(void)
  
         while (l2arc_thread_exit == 0) {
                 CALLB_CPR_SAFE_BEGIN(&cpr);
-               (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
-                   next);
+               (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
+                   &l2arc_feed_thr_lock, next);
                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
-               next = lbolt + hz;
+               next = ddi_get_lbolt() + hz;
  
                 /*
                  * Quick check for L2ARC devices.
@@ -4461,7 +4718,7 @@ l2arc_feed_thread(void)
                         continue;
                 }
                 mutex_exit(&l2arc_dev_mtx);
-               begin = lbolt;
+               begin = ddi_get_lbolt();
  
                 /*
                  * This selects the next l2arc device to write to, and in
@@ -4480,9 +4737,19 @@ l2arc_feed_thread(void)
                 ASSERT(spa != NULL);
  
                 /*
+                * If the pool is read-only then force the feed thread to
+                * sleep a little longer.
+                */
+               if (!spa_writeable(spa)) {
+                       next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
+                       spa_config_exit(spa, SCL_L2ARC, dev);
+                       continue;
+               }
+
+               /*
                  * Avoid contributing to memory pressure.
                  */
-               if (arc_reclaim_needed()) {
+               if (arc_no_grow) {
                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
                         spa_config_exit(spa, SCL_L2ARC, dev);
                         continue;
@@ -4556,6 +4823,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
         adddev->l2ad_evict = adddev->l2ad_start;
         adddev->l2ad_first = B_TRUE;
         adddev->l2ad_writing = B_FALSE;
+       list_link_init(&adddev->l2ad_node);
         ASSERT3U(adddev->l2ad_write, >, 0);
  
         /*
@@ -4566,7 +4834,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
             offsetof(arc_buf_hdr_t, b_l2node));
  
-       spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
+       vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
  
         /*
          * Add device to global list
@@ -4681,3 +4949,57 @@ l2arc_stop(void)
                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
         mutex_exit(&l2arc_feed_thr_lock);
  }
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(arc_read);
+EXPORT_SYMBOL(arc_buf_remove_ref);
+EXPORT_SYMBOL(arc_getbuf_func);
+EXPORT_SYMBOL(arc_add_prune_callback);
+EXPORT_SYMBOL(arc_remove_prune_callback);
+/* Tunables exported as module parameters, all with mode 0444. */
+module_param(zfs_arc_min, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
+
+module_param(zfs_arc_max, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
+
+module_param(zfs_arc_meta_limit, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
+
+module_param(zfs_arc_meta_prune, int, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
+
+module_param(zfs_arc_grow_retry, int, 0444);
+MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
+
+module_param(zfs_arc_shrink_shift, int, 0444);
+MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
+
+module_param(zfs_arc_p_min_shift, int, 0444);
+MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
+
+module_param(l2arc_write_max, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
+
+module_param(l2arc_write_boost, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
+
+module_param(l2arc_headroom, ulong, 0444);
+MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
+
+module_param(l2arc_feed_secs, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
+
+module_param(l2arc_feed_min_ms, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
+
+module_param(l2arc_noprefetch, int, 0444);
+MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
+
+module_param(l2arc_feed_again, int, 0444);
+MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
+
+module_param(l2arc_norw, int, 0444);
+MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
+
+#endif /* _KERNEL && HAVE_SPL */