Fix using zvol as slog device

[zfs.git] / module / zfs / arc.c
diff --git a/module/zfs/arc.c b/module/zfs/arc.c

index 7f1f747..6ec9f04 100644 (file)
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -20,6 +20,8 @@
   */
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
   */
  
  /*
@@ -104,6 +106,14 @@
   * protected from simultaneous callbacks from arc_buf_evict()
   * and arc_do_user_evicts().
   *
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted.  In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored.  For example,
+ * when using the ZPL each dentry holds a reference on a znode.  These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
   * Note that the majority of the performance stats are manipulated
   * with atomic operations.
   *
@@ -120,29 +130,25 @@
  #include <sys/zio.h>
  #include <sys/zfs_context.h>
  #include <sys/arc.h>
-#include <sys/refcount.h>
  #include <sys/vdev.h>
  #include <sys/vdev_impl.h>
  #ifdef _KERNEL
  #include <sys/vmsystm.h>
  #include <vm/anon.h>
  #include <sys/fs/swapnode.h>
-#include <sys/dnlc.h>
+#include <sys/zpl.h>
  #endif
  #include <sys/callb.h>
  #include <sys/kstat.h>
+#include <sys/dmu_tx.h>
  #include <zfs_fletcher.h>
  
  static kmutex_t                arc_reclaim_thr_lock;
  static kcondvar_t      arc_reclaim_thr_cv;     /* used to signal reclaim thr */
  static uint8_t         arc_thread_exit;
  
-extern int zfs_write_limit_shift;
-extern uint64_t zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
-
-#define        ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+/* number of bytes to prune from caches when arc_meta_limit is reached */
+uint_t arc_meta_prune = 1048576;
  
  typedef enum arc_reclaim_strategy {
         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
@@ -150,7 +156,10 @@ typedef enum arc_reclaim_strategy {
  } arc_reclaim_strategy_t;
  
  /* number of seconds before growing cache again */
-static int             arc_grow_retry = 60;
+static int             arc_grow_retry = 5;
+
+/* expiration time for arc_no_grow */
+static clock_t         arc_grow_time = 0;
  
  /* shift of arc_c for calculating both min and max arc_p */
  static int             arc_p_min_shift = 4;
@@ -180,7 +189,7 @@ unsigned long zfs_arc_meta_limit = 0;
  int zfs_arc_grow_retry = 0;
  int zfs_arc_shrink_shift = 0;
  int zfs_arc_p_min_shift = 0;
-int zfs_arc_reduce_dnlc_percent = 0;
+int zfs_arc_meta_prune = 0;
  
  /*
   * Note that buffers can be in one of 6 states:
@@ -264,6 +273,21 @@ typedef struct arc_stats {
         kstat_named_t arcstat_hdr_size;
         kstat_named_t arcstat_data_size;
         kstat_named_t arcstat_other_size;
+       kstat_named_t arcstat_anon_size;
+       kstat_named_t arcstat_anon_evict_data;
+       kstat_named_t arcstat_anon_evict_metadata;
+       kstat_named_t arcstat_mru_size;
+       kstat_named_t arcstat_mru_evict_data;
+       kstat_named_t arcstat_mru_evict_metadata;
+       kstat_named_t arcstat_mru_ghost_size;
+       kstat_named_t arcstat_mru_ghost_evict_data;
+       kstat_named_t arcstat_mru_ghost_evict_metadata;
+       kstat_named_t arcstat_mfu_size;
+       kstat_named_t arcstat_mfu_evict_data;
+       kstat_named_t arcstat_mfu_evict_metadata;
+       kstat_named_t arcstat_mfu_ghost_size;
+       kstat_named_t arcstat_mfu_ghost_evict_data;
+       kstat_named_t arcstat_mfu_ghost_evict_metadata;
         kstat_named_t arcstat_l2_hits;
         kstat_named_t arcstat_l2_misses;
         kstat_named_t arcstat_l2_feeds;
@@ -288,6 +312,7 @@ typedef struct arc_stats {
         kstat_named_t arcstat_no_grow;
         kstat_named_t arcstat_tempreserve;
         kstat_named_t arcstat_loaned_bytes;
+       kstat_named_t arcstat_prune;
         kstat_named_t arcstat_meta_used;
         kstat_named_t arcstat_meta_limit;
         kstat_named_t arcstat_meta_max;
@@ -328,6 +353,21 @@ static arc_stats_t arc_stats = {
         { "hdr_size",                   KSTAT_DATA_UINT64 },
         { "data_size",                  KSTAT_DATA_UINT64 },
         { "other_size",                 KSTAT_DATA_UINT64 },
+       { "anon_size",                  KSTAT_DATA_UINT64 },
+       { "anon_evict_data",            KSTAT_DATA_UINT64 },
+       { "anon_evict_metadata",        KSTAT_DATA_UINT64 },
+       { "mru_size",                   KSTAT_DATA_UINT64 },
+       { "mru_evict_data",             KSTAT_DATA_UINT64 },
+       { "mru_evict_metadata",         KSTAT_DATA_UINT64 },
+       { "mru_ghost_size",             KSTAT_DATA_UINT64 },
+       { "mru_ghost_evict_data",       KSTAT_DATA_UINT64 },
+       { "mru_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
+       { "mfu_size",                   KSTAT_DATA_UINT64 },
+       { "mfu_evict_data",             KSTAT_DATA_UINT64 },
+       { "mfu_evict_metadata",         KSTAT_DATA_UINT64 },
+       { "mfu_ghost_size",             KSTAT_DATA_UINT64 },
+       { "mfu_ghost_evict_data",       KSTAT_DATA_UINT64 },
+       { "mfu_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
         { "l2_hits",                    KSTAT_DATA_UINT64 },
         { "l2_misses",                  KSTAT_DATA_UINT64 },
         { "l2_feeds",                   KSTAT_DATA_UINT64 },
@@ -352,6 +392,7 @@ static arc_stats_t arc_stats = {
         { "arc_no_grow",                KSTAT_DATA_UINT64 },
         { "arc_tempreserve",            KSTAT_DATA_UINT64 },
         { "arc_loaned_bytes",           KSTAT_DATA_UINT64 },
+       { "arc_prune",                  KSTAT_DATA_UINT64 },
         { "arc_meta_used",              KSTAT_DATA_UINT64 },
         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
         { "arc_meta_max",               KSTAT_DATA_UINT64 },
@@ -481,6 +522,8 @@ struct arc_buf_hdr {
         list_node_t             b_l2node;
  };
  
+static list_t arc_prune_list;
+static kmutex_t arc_prune_mtx;
  static arc_buf_t *arc_eviction_list;
  static kmutex_t arc_eviction_mtx;
  static arc_buf_hdr_t arc_eviction_hdr;
@@ -582,14 +625,14 @@ uint64_t zfs_crc64_table[256];
  /*
   * L2ARC Performance Tunables
   */
-uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;   /* default max write size */
-uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
-uint64_t l2arc_headroom = L2ARC_HEADROOM;      /* number of dev writes */
-uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;    /* interval seconds */
-uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;        /* min interval milliseconds */
-boolean_t l2arc_noprefetch = B_TRUE;           /* don't cache prefetch bufs */
-boolean_t l2arc_feed_again = B_TRUE;           /* turbo warmup */
-boolean_t l2arc_norw = B_TRUE;                 /* no reads during writes */
+unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;      /* def max write size */
+unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;    /* extra warmup write */
+unsigned long l2arc_headroom = L2ARC_HEADROOM;         /* # of dev writes */
+unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;       /* interval seconds */
+unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;   /* min interval msecs */
+int l2arc_noprefetch = B_TRUE;                 /* don't cache prefetch bufs */
+int l2arc_feed_again = B_TRUE;                 /* turbo warmup */
+int l2arc_norw = B_TRUE;                       /* no reads during writes */
  
  /*
   * L2ARC Internals
@@ -867,22 +910,6 @@ buf_dest(void *vbuf, void *unused)
         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
  }
  
-/*
- * Reclaim callback -- invoked when memory is low.
- */
-/* ARGSUSED */
-static void
-hdr_recl(void *unused)
-{
-       dprintf("hdr_recl called\n");
-       /*
-        * umem calls the reclaim func when we destroy the buf cache,
-        * which is after we do arc_fini().
-        */
-       if (!arc_dead)
-               cv_signal(&arc_reclaim_thr_cv);
-}
-
  static void
  buf_init(void)
  {
@@ -915,7 +942,7 @@ retry:
         }
  
         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
-           0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+           0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0);
         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
  
@@ -976,7 +1003,8 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
                 mutex_exit(&buf->b_hdr->b_freeze_lock);
                 return;
         }
-       buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+       buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+                                               KM_PUSHPAGE);
         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
             buf->b_hdr->b_freeze_cksum);
         mutex_exit(&buf->b_hdr->b_freeze_lock);
@@ -1254,7 +1282,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
         ASSERT(BUF_EMPTY(hdr));
         hdr->b_size = size;
         hdr->b_type = type;
-       hdr->b_spa = spa_guid(spa);
+       hdr->b_spa = spa_load_guid(spa);
         hdr->b_state = arc_anon;
         hdr->b_arc_access = 0;
         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
@@ -1388,7 +1416,7 @@ arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
  {
         if (HDR_L2_WRITING(hdr)) {
                 l2arc_data_free_t *df;
-               df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+               df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE);
                 df->l2df_data = data;
                 df->l2df_size = size;
                 df->l2df_func = free_func;
@@ -1925,6 +1953,48 @@ arc_adjust(void)
         }
  }
  
+/*
+ * Request that arc user drop references so that N bytes can be released
+ * from the cache.  This provides a mechanism to ensure the arc can honor
+ * the arc_meta_limit and reclaim buffers which are pinned in the cache
+ * by higher layers.  (i.e. the zpl)
+ */
+static void
+arc_do_user_prune(int64_t adjustment)
+{
+       arc_prune_func_t *func;
+       void *private;
+       arc_prune_t *cp, *np;
+
+       mutex_enter(&arc_prune_mtx);
+
+       cp = list_head(&arc_prune_list);
+       while (cp != NULL) {
+               func = cp->p_pfunc;
+               private = cp->p_private;
+               np = list_next(&arc_prune_list, cp);
+               refcount_add(&cp->p_refcnt, func);
+               mutex_exit(&arc_prune_mtx);
+
+               if (func != NULL)
+                       func(adjustment, private);
+
+               mutex_enter(&arc_prune_mtx);
+
+               /* User removed prune callback concurrently with execution */
+               if (refcount_remove(&cp->p_refcnt, func) == 0) {
+                       ASSERT(!list_link_active(&cp->p_node));
+                       refcount_destroy(&cp->p_refcnt);
+                       kmem_free(cp, sizeof (*cp));
+               }
+
+               cp = np;
+       }
+
+       ARCSTAT_BUMP(arcstat_prune);
+       mutex_exit(&arc_prune_mtx);
+}
+
  static void
  arc_do_user_evicts(void)
  {
@@ -1949,6 +2019,32 @@ arc_do_user_evicts(void)
  }
  
  /*
+ * Evict only meta data objects from the cache leaving the data objects.
+ * This is only used to enforce the tunable arc_meta_limit; if we are
+ * unable to evict enough buffers, notify the user via the prune callback.
+ */
+void
+arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
+{
+       int64_t delta;
+
+       if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+               delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+               arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
+               adjustment -= delta;
+       }
+
+       if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+               delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+               arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
+               adjustment -= delta;
+       }
+
+       if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
+               arc_do_user_prune(arc_meta_prune);
+}
+
+/*
   * Flush all *evictable* data from the cache for the given spa.
   * NOTE: this will not touch "active" (i.e. referenced) data.
   */
@@ -1958,7 +2054,7 @@ arc_flush(spa_t *spa)
         uint64_t guid = 0;
  
         if (spa)
-               guid = spa_guid(spa);
+               guid = spa_load_guid(spa);
  
         while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
@@ -1991,16 +2087,13 @@ arc_flush(spa_t *spa)
  }
  
  void
-arc_shrink(void)
+arc_shrink(uint64_t bytes)
  {
         if (arc_c > arc_c_min) {
                 uint64_t to_free;
  
-#ifdef _KERNEL
-               to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
-#else
-               to_free = arc_c >> arc_shrink_shift;
-#endif
+               to_free = bytes ? bytes : arc_c >> arc_shrink_shift;
+
                 if (arc_c > arc_c_min + to_free)
                         atomic_add_64(&arc_c, -to_free);
                 else
@@ -2019,97 +2112,21 @@ arc_shrink(void)
                 arc_adjust();
  }
  
-static int
-arc_reclaim_needed(void)
-{
-#ifdef _KERNEL
-       uint64_t extra;
-
-       if (needfree)
-               return (1);
-
-       /*
-        * take 'desfree' extra pages, so we reclaim sooner, rather than later
-        */
-       extra = desfree;
-
-       /*
-        * check that we're out of range of the pageout scanner.  It starts to
-        * schedule paging if freemem is less than lotsfree and needfree.
-        * lotsfree is the high-water mark for pageout, and needfree is the
-        * number of needed free pages.  We add extra pages here to make sure
-        * the scanner doesn't start up while we're freeing memory.
-        */
-       if (freemem < lotsfree + needfree + extra)
-               return (1);
-
-       /*
-        * check to make sure that swapfs has enough space so that anon
-        * reservations can still succeed. anon_resvmem() checks that the
-        * availrmem is greater than swapfs_minfree, and the number of reserved
-        * swap pages.  We also add a bit of extra here just to prevent
-        * circumstances from getting really dire.
-        */
-       if (availrmem < swapfs_minfree + swapfs_reserve + extra)
-               return (1);
-
-#if defined(__i386)
-       /*
-        * If we're on an i386 platform, it's possible that we'll exhaust the
-        * kernel heap space before we ever run out of available physical
-        * memory.  Most checks of the size of the heap_area compare against
-        * tune.t_minarmem, which is the minimum available real memory that we
-        * can have in the system.  However, this is generally fixed at 25 pages
-        * which is so low that it's useless.  In this comparison, we seek to
-        * calculate the total heap-size, and reclaim if more than 3/4ths of the
-        * heap is allocated.  (Or, in the calculation, if less than 1/4th is
-        * free)
-        */
-       if (btop(vmem_size(heap_arena, VMEM_FREE)) <
-           (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
-               return (1);
-#endif
-
-#else
-       if (spa_get_random(100) == 0)
-               return (1);
-#endif
-       return (0);
-}
-
  static void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
  {
         size_t                  i;
         kmem_cache_t            *prev_cache = NULL;
         kmem_cache_t            *prev_data_cache = NULL;
         extern kmem_cache_t     *zio_buf_cache[];
         extern kmem_cache_t     *zio_data_buf_cache[];
-#ifdef _KERNEL
-       int                     retry = 0;
-
-       while ((arc_meta_used >= arc_meta_limit) && (retry < 10)) {
-               /*
-                * We are exceeding our meta-data cache limit.
-                * Purge some DNLC entries to release holds on meta-data.
-                */
-               dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
-               retry++;
-       }
-#if defined(__i386)
-       /*
-        * Reclaim unused memory from all kmem caches.
-        */
-       kmem_reap();
-#endif
-#endif
  
         /*
          * An aggressive reclamation will shrink the cache size as well as
          * reap free buffers from the arc kmem caches.
          */
         if (strat == ARC_RECLAIM_AGGR)
-               arc_shrink();
+               arc_shrink(bytes);
  
         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
                 if (zio_buf_cache[i] != prev_cache) {
@@ -2121,22 +2138,32 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
                         kmem_cache_reap_now(zio_data_buf_cache[i]);
                 }
         }
+
         kmem_cache_reap_now(buf_cache);
         kmem_cache_reap_now(hdr_cache);
  }
  
+/*
+ * Unlike other ZFS implementations this thread is only responsible for
+ * adapting the target ARC size on Linux.  The responsibility for memory
+ * reclamation has been entirely delegated to the arc_shrinker_func()
+ * which is registered with the VM.  To reflect this change in behavior
+ * the arc_reclaim thread has been renamed to arc_adapt.
+ */
  static void
-arc_reclaim_thread(void)
+arc_adapt_thread(void)
  {
-       clock_t                 growtime = 0;
-       arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
         callb_cpr_t             cpr;
+       int64_t                 prune;
  
         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
  
         mutex_enter(&arc_reclaim_thr_lock);
         while (arc_thread_exit == 0) {
-               if (arc_reclaim_needed()) {
+#ifndef _KERNEL
+               arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
+
+               if (spa_get_random(100) == 0) {
  
                         if (arc_no_grow) {
                                 if (last_reclaim == ARC_RECLAIM_CONS) {
@@ -2151,18 +2178,25 @@ arc_reclaim_thread(void)
                         }
  
                         /* reset the growth delay for every reclaim */
-                       growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
+                       arc_grow_time = ddi_get_lbolt()+(arc_grow_retry * hz);
  
-                       arc_kmem_reap_now(last_reclaim);
+                       arc_kmem_reap_now(last_reclaim, 0);
                         arc_warm = B_TRUE;
+               }
+#endif /* !_KERNEL */
  
-               } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
+               /* No recent memory pressure, allow the ARC to grow. */
+               if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
                         arc_no_grow = FALSE;
-               }
  
-               /* Keep meta data usage within limits */
-               if (arc_meta_used >= arc_meta_limit)
-                       arc_kmem_reap_now(ARC_RECLAIM_CONS);
+               /*
+                * Keep meta data usage within limits, arc_shrink() is not
+                * used to avoid collapsing the arc_c value when only the
+                * arc_meta_limit is being exceeded.
+                */
+               prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
+               if (prune > 0)
+                       arc_adjust_meta(prune, B_TRUE);
  
                 arc_adjust();
  
@@ -2184,28 +2218,83 @@ arc_reclaim_thread(void)
  
  #ifdef _KERNEL
  /*
- * Under Linux the arc shrinker may be called for synchronous (direct)
- * reclaim, or asynchronous (indirect) reclaim.  When called by kswapd
- * for indirect reclaim we take a conservative approach and just reap
- * free slabs from the ARC caches.  If this proves to be insufficient
- * direct reclaim will be trigger.  In direct reclaim a more aggressive
- * strategy is used, data is evicted from the ARC and free slabs reaped.
+ * Determine the amount of memory eligible for eviction contained in the
+ * ARC. All clean data reported by the ghost lists can always be safely
+ * evicted. Due to arc_c_min, the same does not hold for all clean data
+ * contained by the regular mru and mfu lists.
+ *
+ * In the case of the regular mru and mfu lists, we need to report as
+ * much clean data as possible, such that evicting that same reported
+ * data will not bring arc_size below arc_c_min. Thus, in certain
+ * circumstances, the total amount of clean data in the mru and mfu
+ * lists might not actually be evictable.
+ *
+ * The following two distinct cases are accounted for:
+ *
+ * 1. The sum of the amount of dirty data contained by both the mru and
+ *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ *    is greater than or equal to arc_c_min.
+ *    (i.e. amount of dirty data >= arc_c_min)
+ *
+ *    This is the easy case; all clean data contained by the mru and mfu
+ *    lists is evictable. Evicting all clean data can only drop arc_size
+ *    to the amount of dirty data, which is greater than arc_c_min.
+ *
+ * 2. The sum of the amount of dirty data contained by both the mru and
+ *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ *    is less than arc_c_min.
+ *    (i.e. arc_c_min > amount of dirty data)
+ *
+ *    2.1. arc_size is greater than or equal to arc_c_min.
+ *         (i.e. arc_size >= arc_c_min > amount of dirty data)
+ *
+ *         In this case, not all clean data from the regular mru and mfu
+ *         lists is actually evictable; we must leave enough clean data
+ *         to keep arc_size above arc_c_min. Thus, the maximum amount of
+ *         evictable data from the two lists combined, is exactly the
+ *         difference between arc_size and arc_c_min.
+ *
+ *    2.2. arc_size is less than arc_c_min
+ *         (i.e. arc_c_min > arc_size > amount of dirty data)
+ *
+ *         In this case, none of the data contained in the mru and mfu
+ *         lists is evictable, even if it's clean. Since arc_size is
+ *         already below arc_c_min, evicting any more would only
+ *         increase this negative difference.
   */
+static uint64_t
+arc_evictable_memory(void) {
+       uint64_t arc_clean =
+           arc_mru->arcs_lsize[ARC_BUFC_DATA] +
+           arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+           arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
+           arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+       uint64_t ghost_clean =
+           arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
+           arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
+           arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
+           arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
+       uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
+
+       if (arc_dirty >= arc_c_min)
+               return (ghost_clean + arc_clean);
+
+       return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
+}
+
  static int
  __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
  {
-       arc_reclaim_strategy_t strategy;
-       int arc_reclaim;
+       uint64_t pages;
  
-       /* Return number of reclaimable pages based on arc_shrink_shift */
-       arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
-           >> arc_shrink_shift, 0);
-       if (sc->nr_to_scan == 0)
-               return (arc_reclaim);
+       /* The arc is considered warm once reclaim has occurred */
+       if (unlikely(arc_warm == B_FALSE))
+               arc_warm = B_TRUE;
  
-       /* Prevent reclaim below arc_c_min */
-       if (arc_reclaim <= 0)
-               return (-1);
+       /* Return the potential number of reclaimable pages */
+       pages = btop(arc_evictable_memory());
+       if (sc->nr_to_scan == 0)
+               return (pages);
  
         /* Not allowed to perform filesystem reclaim */
         if (!(sc->gfp_mask & __GFP_FS))
@@ -2215,20 +2304,37 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
         if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
                 return (-1);
  
+       /*
+        * Evict the requested number of pages by shrinking arc_c the
+        * requested amount.  If there is nothing left to evict just
+        * reap whatever we can from the various arc slabs.
+        */
+       if (pages > 0) {
+               arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
+               pages = btop(arc_evictable_memory());
+       } else {
+               arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
+               pages = -1;
+       }
+
+       /*
+        * When direct reclaim is observed it usually indicates a rapid
+        * increase in memory pressure.  This occurs because the kswapd
+        * threads were unable to asynchronously keep enough free memory
+        * available.  In this case set arc_no_grow to briefly pause arc
+        * growth to avoid compounding the memory pressure.
+        */
         if (current_is_kswapd()) {
-               strategy = ARC_RECLAIM_CONS;
-               ARCSTAT_INCR(arcstat_memory_indirect_count, 1);
+               ARCSTAT_BUMP(arcstat_memory_indirect_count);
         } else {
-               strategy = ARC_RECLAIM_AGGR;
-               ARCSTAT_INCR(arcstat_memory_direct_count, 1);
+               arc_no_grow = B_TRUE;
+               arc_grow_time = ddi_get_lbolt() + (arc_grow_retry * hz);
+               ARCSTAT_BUMP(arcstat_memory_direct_count);
         }
  
-       arc_kmem_reap_now(strategy);
-       arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
-           >> arc_shrink_shift, 0);
         mutex_exit(&arc_reclaim_thr_lock);
  
-       return (arc_reclaim);
+       return (pages);
  }
  SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
  
@@ -2276,11 +2382,6 @@ arc_adapt(int bytes, arc_state_t *state)
         }
         ASSERT((int64_t)arc_p >= 0);
  
-       if (arc_reclaim_needed()) {
-               cv_signal(&arc_reclaim_thr_cv);
-               return;
-       }
-
         if (arc_no_grow)
                 return;
  
@@ -2313,19 +2414,7 @@ arc_evict_needed(arc_buf_contents_t type)
         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
                 return (1);
  
-#ifdef _KERNEL
-       /*
-        * If zio data pages are being allocated out of a separate heap segment,
-        * then enforce that the size of available vmem for this area remains
-        * above about 1/32nd free.
-        */
-       if (type == ARC_BUFC_DATA && zio_arena != NULL &&
-           vmem_size(zio_arena, VMEM_FREE) <
-           (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
-               return (1);
-#endif
-
-       if (arc_reclaim_needed())
+       if (arc_no_grow)
                 return (1);
  
         return (arc_size > arc_c);
@@ -2399,16 +2488,27 @@ arc_get_data_buf(arc_buf_t *buf)
                 state =  (arc_mru->arcs_lsize[type] >= size &&
                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
         }
+
         if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
                 if (type == ARC_BUFC_METADATA) {
                         buf->b_data = zio_buf_alloc(size);
                         arc_space_consume(size, ARC_SPACE_DATA);
+
+                       /*
+                        * If we are unable to recycle an existing meta buffer
+                        * signal the reclaim thread.  It will notify users
+                        * via the prune callback to drop references.  The
+                        * prune callback is run in the context of the reclaim
+                        * thread to avoid deadlocking on the hash_lock.
+                        */
+                       cv_signal(&arc_reclaim_thr_cv);
                 } else {
                         ASSERT(type == ARC_BUFC_DATA);
                         buf->b_data = zio_data_buf_alloc(size);
                         ARCSTAT_INCR(arcstat_data_size, size);
                         atomic_add_64(&arc_size, size);
                 }
+
                 ARCSTAT_BUMP(arcstat_recycle_miss);
         }
         ASSERT(buf->b_data != NULL);
@@ -2773,7 +2873,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
         arc_buf_t *buf = NULL;
         kmutex_t *hash_lock;
         zio_t *rzio;
-       uint64_t guid = spa_guid(spa);
+       uint64_t guid = spa_load_guid(spa);
  
  top:
         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
@@ -3021,6 +3121,37 @@ top:
         return (0);
  }
  
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
+{
+       arc_prune_t *p;
+
+       p = kmem_alloc(sizeof(*p), KM_SLEEP);
+       p->p_pfunc = func;
+       p->p_private = private;
+       list_link_init(&p->p_node);
+       refcount_create(&p->p_refcnt);
+
+       mutex_enter(&arc_prune_mtx);
+       refcount_add(&p->p_refcnt, &arc_prune_list);
+       list_insert_head(&arc_prune_list, p);
+       mutex_exit(&arc_prune_mtx);
+
+       return (p);
+}
+
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+       mutex_enter(&arc_prune_mtx);
+       list_remove(&arc_prune_list, p);
+       if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
+               refcount_destroy(&p->p_refcnt);
+               kmem_free(p, sizeof (*p));
+       }
+       mutex_exit(&arc_prune_mtx);
+}
+
  void
  arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
  {
@@ -3400,7 +3531,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
         ASSERT(hdr->b_acb == NULL);
         if (l2arc)
                 hdr->b_flags |= ARC_L2CACHE;
-       callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+       callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE);
         callback->awcb_ready = ready;
         callback->awcb_done = done;
         callback->awcb_private = private;
@@ -3416,50 +3547,20 @@ static int
  arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
  {
  #ifdef _KERNEL
-       uint64_t available_memory = ptob(freemem);
-       static uint64_t page_load = 0;
-       static uint64_t last_txg = 0;
+       uint64_t available_memory;
  
-#if defined(__i386)
-       available_memory =
-           MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
-#endif
-       if (available_memory >= zfs_write_limit_max)
-               return (0);
+       /* Easily reclaimable memory (free + inactive + arc-evictable) */
+       available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();
  
-       if (txg > last_txg) {
-               last_txg = txg;
-               page_load = 0;
-       }
-       /*
-        * If we are in pageout, we know that memory is already tight,
-        * the arc is already going to be evicting, so we just want to
-        * continue to let page writes occur as quickly as possible.
-        */
-       if (curproc == proc_pageout) {
-               if (page_load > MAX(ptob(minfree), available_memory) / 4)
-                       return (ERESTART);
-               /* Note: reserve is inflated, so we deflate */
-               page_load += reserve / 8;
-               return (0);
-       } else if (page_load > 0 && arc_reclaim_needed()) {
-               /* memory is low, delay before restarting */
+       if (available_memory <= zfs_write_limit_max) {
                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+               DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
                 return (EAGAIN);
         }
-       page_load = 0;
-
-       if (arc_size > arc_c_min) {
-               uint64_t evictable_memory =
-                   arc_mru->arcs_lsize[ARC_BUFC_DATA] +
-                   arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
-                   arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
-                   arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
-               available_memory += MIN(evictable_memory, arc_size - arc_c_min);
-       }
  
         if (inflight_data > available_memory / 4) {
                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+               DMU_TX_STAT_BUMP(dmu_tx_memory_inflight);
                 return (ERESTART);
         }
  #endif
@@ -3490,8 +3591,10 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
  #endif
         if (reserve > arc_c/4 && !arc_no_grow)
                 arc_c = MIN(arc_c_max, reserve * 4);
-       if (reserve > arc_c)
+       if (reserve > arc_c) {
+               DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
                 return (ENOMEM);
+       }
  
         /*
          * Don't count loaned bufs as in flight dirty data to prevent long
@@ -3524,12 +3627,55 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
                     reserve>>10, arc_c>>10);
+               DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
                 return (ERESTART);
         }
         atomic_add_64(&arc_tempreserve, reserve);
         return (0);
  }
  
+static void            /* copy one arc state's counters into its kstats */
+arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
+    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
+{
+       size->value.ui64 = state->arcs_size;    /* arcstat_<state>_size */
+       evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
+       evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
+}
+
+static int             /* ks_update hook: refresh the per-state arc kstats */
+arc_kstat_update(kstat_t *ksp, int rw)
+{
+       arc_stats_t *as = ksp->ks_data; /* stats block attached to ksp */
+
+       if (rw == KSTAT_WRITE) {        /* writes are refused with EACCES */
+               return (EACCES);
+       } else {        /* read: one refresh per arc state, five in all */
+               arc_kstat_update_state(arc_anon,
+                   &as->arcstat_anon_size,
+                   &as->arcstat_anon_evict_data,
+                   &as->arcstat_anon_evict_metadata);
+               arc_kstat_update_state(arc_mru,
+                   &as->arcstat_mru_size,
+                   &as->arcstat_mru_evict_data,
+                   &as->arcstat_mru_evict_metadata);
+               arc_kstat_update_state(arc_mru_ghost,
+                   &as->arcstat_mru_ghost_size,
+                   &as->arcstat_mru_ghost_evict_data,
+                   &as->arcstat_mru_ghost_evict_metadata);
+               arc_kstat_update_state(arc_mfu,
+                   &as->arcstat_mfu_size,
+                   &as->arcstat_mfu_evict_data,
+                   &as->arcstat_mfu_evict_metadata);
+               arc_kstat_update_state(arc_mfu_ghost,
+                   &as->arcstat_mfu_ghost_size,
+                   &as->arcstat_mfu_ghost_evict_data,
+                   &as->arcstat_mfu_ghost_evict_metadata);
+       }
+
+       return (0);     /* only reached on the read path */
+}
+
  void
  arc_init(void)
  {
@@ -3559,12 +3705,8 @@ arc_init(void)
  
         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
         arc_c_min = MAX(arc_c / 4, 64<<20);
-       /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
-       if (arc_c * 8 >= 1<<30)
-               arc_c_max = (arc_c * 8) - (1<<30);
-       else
-               arc_c_max = arc_c_min;
-       arc_c_max = MAX(arc_c * 6, arc_c_max);
+       /* set max to 1/2 of all memory */
+       arc_c_max = MAX(arc_c * 4, arc_c_max);
  
         /*
          * Allow the tunables to override our calculations if they are
@@ -3598,8 +3740,8 @@ arc_init(void)
         if (zfs_arc_p_min_shift > 0)
                 arc_p_min_shift = zfs_arc_p_min_shift;
  
-       if (zfs_arc_reduce_dnlc_percent > 0)
-               arc_reduce_dnlc_percent = zfs_arc_reduce_dnlc_percent;
+       if (zfs_arc_meta_prune > 0)
+               arc_meta_prune = zfs_arc_meta_prune;
  
         /* if kmem_flags are set, lets try to use less memory */
         if (kmem_debugging())
@@ -3646,7 +3788,10 @@ arc_init(void)
         buf_init();
  
         arc_thread_exit = 0;
+       list_create(&arc_prune_list, sizeof (arc_prune_t),
+           offsetof(arc_prune_t, p_node));
         arc_eviction_list = NULL;
+       mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
  
@@ -3655,10 +3800,11 @@ arc_init(void)
  
         if (arc_ksp != NULL) {
                 arc_ksp->ks_data = &arc_stats;
+               arc_ksp->ks_update = arc_kstat_update;
                 kstat_install(arc_ksp);
         }
  
-       (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+       (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
             TS_RUN, minclsyspri);
  
         arc_dead = FALSE;
@@ -3674,6 +3820,8 @@ arc_init(void)
  void
  arc_fini(void)
  {
+       arc_prune_t *p;
+
         mutex_enter(&arc_reclaim_thr_lock);
  #ifdef _KERNEL
         spl_unregister_shrinker(&arc_shrinker);
@@ -3693,6 +3841,17 @@ arc_fini(void)
                 arc_ksp = NULL;
         }
  
+       mutex_enter(&arc_prune_mtx);
+       while ((p = list_head(&arc_prune_list)) != NULL) {
+               list_remove(&arc_prune_list, p);
+               refcount_remove(&p->p_refcnt, &arc_prune_list);
+               refcount_destroy(&p->p_refcnt);
+               kmem_free(p, sizeof (*p));
+       }
+       mutex_exit(&arc_prune_mtx);
+
+       list_destroy(&arc_prune_list);
+       mutex_destroy(&arc_prune_mtx);
         mutex_destroy(&arc_eviction_mtx);
         mutex_destroy(&arc_reclaim_thr_lock);
         cv_destroy(&arc_reclaim_thr_cv);
@@ -4345,7 +4504,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
         boolean_t have_lock, full;
         l2arc_write_callback_t *cb;
         zio_t *pio, *wzio;
-       uint64_t guid = spa_guid(spa);
+       uint64_t guid = spa_load_guid(spa);
         int try;
  
         ASSERT(dev->l2ad_vdev != NULL);
@@ -4419,8 +4578,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
                                  */
                                 list_insert_head(dev->l2ad_buflist, head);
  
-                               cb = kmem_alloc(
-                                   sizeof (l2arc_write_callback_t), KM_SLEEP);
+                               cb = kmem_alloc(sizeof (l2arc_write_callback_t),
+                                               KM_PUSHPAGE);
                                 cb->l2wcb_dev = dev;
                                 cb->l2wcb_head = head;
                                 pio = zio_root(spa, l2arc_write_done, cb,
@@ -4430,7 +4589,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
                         /*
                          * Create and add a new L2ARC header.
                          */
-                       hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+                       hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
+                                           KM_PUSHPAGE);
                         hdrl2->b_dev = dev;
                         hdrl2->b_daddr = dev->l2ad_hand;
  
@@ -4569,7 +4729,7 @@ l2arc_feed_thread(void)
                 /*
                  * Avoid contributing to memory pressure.
                  */
-               if (arc_reclaim_needed()) {
+               if (arc_no_grow) {
                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
                         spa_config_exit(spa, SCL_L2ARC, dev);
                         continue;
@@ -4774,6 +4934,8 @@ l2arc_stop(void)
  EXPORT_SYMBOL(arc_read);
  EXPORT_SYMBOL(arc_buf_remove_ref);
  EXPORT_SYMBOL(arc_getbuf_func);
+EXPORT_SYMBOL(arc_add_prune_callback);
+EXPORT_SYMBOL(arc_remove_prune_callback);
  
  module_param(zfs_arc_min, ulong, 0444);
  MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
@@ -4784,8 +4946,8 @@ MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
  module_param(zfs_arc_meta_limit, ulong, 0444);
  MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
  
-module_param(zfs_arc_reduce_dnlc_percent, int, 0444);
-MODULE_PARM_DESC(zfs_arc_reduce_dnlc_percent, "Meta reclaim percentage");
+module_param(zfs_arc_meta_prune, int, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
  
  module_param(zfs_arc_grow_retry, int, 0444);
  MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
@@ -4796,4 +4958,28 @@ MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
  module_param(zfs_arc_p_min_shift, int, 0444);
  MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
  
+module_param(l2arc_write_max, ulong, 0444);    /* mode 0444: read-only */
+MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
+
+module_param(l2arc_write_boost, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
+
+module_param(l2arc_headroom, ulong, 0444);
+MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
+
+module_param(l2arc_feed_secs, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
+
+module_param(l2arc_feed_min_ms, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
+
+module_param(l2arc_noprefetch, int, 0444);     /* int-typed tunables */
+MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
+
+module_param(l2arc_feed_again, int, 0444);
+MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
+
+module_param(l2arc_norw, int, 0444);
+MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
+
  #endif