Allow setting bootfs on any pool

[zfs.git] / module / zfs / arc.c
diff --git a/module/zfs/arc.c b/module/zfs/arc.c

index 7f1f747..95d14a9 100644 (file)
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -104,6 +104,14 @@
   * protected from simultaneous callbacks from arc_buf_evict()
   * and arc_do_user_evicts().
   *
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted.  In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored.  For example,
+ * when using the ZPL each dentry holds a reference on a znode.  These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
   * Note that the majority of the performance stats are manipulated
   * with atomic operations.
   *
@@ -120,14 +128,13 @@
  #include <sys/zio.h>
  #include <sys/zfs_context.h>
  #include <sys/arc.h>
-#include <sys/refcount.h>
  #include <sys/vdev.h>
  #include <sys/vdev_impl.h>
  #ifdef _KERNEL
  #include <sys/vmsystm.h>
  #include <vm/anon.h>
  #include <sys/fs/swapnode.h>
-#include <sys/dnlc.h>
+#include <sys/zpl.h>
  #endif
  #include <sys/callb.h>
  #include <sys/kstat.h>
@@ -141,8 +148,8 @@ extern int zfs_write_limit_shift;
  extern uint64_t zfs_write_limit_max;
  extern kmutex_t zfs_write_limit_lock;
  
-#define        ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+/* number of bytes to prune from caches when arc_meta_limit is reached */
+uint_t arc_meta_prune = 1048576;
  
  typedef enum arc_reclaim_strategy {
         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
@@ -180,7 +187,7 @@ unsigned long zfs_arc_meta_limit = 0;
  int zfs_arc_grow_retry = 0;
  int zfs_arc_shrink_shift = 0;
  int zfs_arc_p_min_shift = 0;
-int zfs_arc_reduce_dnlc_percent = 0;
+int zfs_arc_meta_prune = 0;
  
  /*
   * Note that buffers can be in one of 6 states:
@@ -288,6 +295,7 @@ typedef struct arc_stats {
         kstat_named_t arcstat_no_grow;
         kstat_named_t arcstat_tempreserve;
         kstat_named_t arcstat_loaned_bytes;
+       kstat_named_t arcstat_prune;
         kstat_named_t arcstat_meta_used;
         kstat_named_t arcstat_meta_limit;
         kstat_named_t arcstat_meta_max;
@@ -352,6 +360,7 @@ static arc_stats_t arc_stats = {
         { "arc_no_grow",                KSTAT_DATA_UINT64 },
         { "arc_tempreserve",            KSTAT_DATA_UINT64 },
         { "arc_loaned_bytes",           KSTAT_DATA_UINT64 },
+       { "arc_prune",                  KSTAT_DATA_UINT64 },
         { "arc_meta_used",              KSTAT_DATA_UINT64 },
         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
         { "arc_meta_max",               KSTAT_DATA_UINT64 },
@@ -481,6 +490,8 @@ struct arc_buf_hdr {
         list_node_t             b_l2node;
  };
  
+static list_t arc_prune_list;
+static kmutex_t arc_prune_mtx;
  static arc_buf_t *arc_eviction_list;
  static kmutex_t arc_eviction_mtx;
  static arc_buf_hdr_t arc_eviction_hdr;
@@ -582,14 +593,14 @@ uint64_t zfs_crc64_table[256];
  /*
   * L2ARC Performance Tunables
   */
-uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;   /* default max write size */
-uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
-uint64_t l2arc_headroom = L2ARC_HEADROOM;      /* number of dev writes */
-uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;    /* interval seconds */
-uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;        /* min interval milliseconds */
-boolean_t l2arc_noprefetch = B_TRUE;           /* don't cache prefetch bufs */
-boolean_t l2arc_feed_again = B_TRUE;           /* turbo warmup */
-boolean_t l2arc_norw = B_TRUE;                 /* no reads during writes */
+unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;      /* def max write size */
+unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;    /* extra warmup write */
+unsigned long l2arc_headroom = L2ARC_HEADROOM;         /* # of dev writes */
+unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;       /* interval seconds */
+unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;   /* min interval msecs */
+int l2arc_noprefetch = B_TRUE;                 /* don't cache prefetch bufs */
+int l2arc_feed_again = B_TRUE;                 /* turbo warmup */
+int l2arc_norw = B_TRUE;                       /* no reads during writes */
  
  /*
   * L2ARC Internals
@@ -1925,6 +1936,48 @@ arc_adjust(void)
         }
  }
  
+/*
+ * Request that arc user drop references so that N bytes can be released
+ * from the cache.  This provides a mechanism to ensure the arc can honor
+ * the arc_meta_limit and reclaim buffers which are pinned in the cache
+ * by higher layers.  (i.e. the zpl)
+ */
+static void
+arc_do_user_prune(int64_t adjustment)
+{
+       arc_prune_func_t *func;
+       void *private;
+       arc_prune_t *cp, *np;
+
+       mutex_enter(&arc_prune_mtx);
+
+       cp = list_head(&arc_prune_list);
+       while (cp != NULL) {
+               func = cp->p_pfunc;
+               private = cp->p_private;
+               np = list_next(&arc_prune_list, cp);
+               refcount_add(&cp->p_refcnt, func);
+               mutex_exit(&arc_prune_mtx);
+
+               if (func != NULL)
+                       func(adjustment, private);
+
+               mutex_enter(&arc_prune_mtx);
+
+               /* User removed prune callback concurrently with execution */
+               if (refcount_remove(&cp->p_refcnt, func) == 0) {
+                       ASSERT(!list_link_active(&cp->p_node));
+                       refcount_destroy(&cp->p_refcnt);
+                       kmem_free(cp, sizeof (*cp));
+               }
+
+               cp = np;
+       }
+
+       ARCSTAT_BUMP(arcstat_prune);
+       mutex_exit(&arc_prune_mtx);
+}
+
  static void
  arc_do_user_evicts(void)
  {
@@ -1949,6 +2002,32 @@ arc_do_user_evicts(void)
  }
  
  /*
+ * Evict only meta data objects from the cache leaving the data objects.
+ * This is only used to enforce the tunable arc_meta_limit.  If we are
+ * unable to evict enough buffers, notify the user via the prune callback.
+ */
+void
+arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
+{
+       int64_t delta;
+
+       if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+               delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+               arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
+               adjustment -= delta;
+       }
+
+       if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+               delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+               arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
+               adjustment -= delta;
+       }
+
+       if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
+               arc_do_user_prune(arc_meta_prune);
+}
+
+/*
   * Flush all *evictable* data from the cache for the given spa.
   * NOTE: this will not touch "active" (i.e. referenced) data.
   */
@@ -2085,24 +2164,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
         kmem_cache_t            *prev_data_cache = NULL;
         extern kmem_cache_t     *zio_buf_cache[];
         extern kmem_cache_t     *zio_data_buf_cache[];
-#ifdef _KERNEL
-       int                     retry = 0;
-
-       while ((arc_meta_used >= arc_meta_limit) && (retry < 10)) {
-               /*
-                * We are exceeding our meta-data cache limit.
-                * Purge some DNLC entries to release holds on meta-data.
-                */
-               dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
-               retry++;
-       }
-#if defined(__i386)
-       /*
-        * Reclaim unused memory from all kmem caches.
-        */
-       kmem_reap();
-#endif
-#endif
  
         /*
          * An aggressive reclamation will shrink the cache size as well as
@@ -2121,6 +2182,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
                         kmem_cache_reap_now(zio_data_buf_cache[i]);
                 }
         }
+
         kmem_cache_reap_now(buf_cache);
         kmem_cache_reap_now(hdr_cache);
  }
@@ -2131,6 +2193,7 @@ arc_reclaim_thread(void)
         clock_t                 growtime = 0;
         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
         callb_cpr_t             cpr;
+       int64_t                 prune;
  
         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
  
@@ -2160,9 +2223,14 @@ arc_reclaim_thread(void)
                         arc_no_grow = FALSE;
                 }
  
-               /* Keep meta data usage within limits */
-               if (arc_meta_used >= arc_meta_limit)
-                       arc_kmem_reap_now(ARC_RECLAIM_CONS);
+               /*
+                * Keep meta data usage within limits; arc_shrink() is not
+                * used to avoid collapsing the arc_c value when only the
+                * arc_meta_limit is being exceeded.
+                */
+               prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
+               if (prune > 0)
+                       arc_adjust_meta(prune, B_TRUE);
  
                 arc_adjust();
  
@@ -2399,16 +2467,27 @@ arc_get_data_buf(arc_buf_t *buf)
                 state =  (arc_mru->arcs_lsize[type] >= size &&
                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
         }
+
         if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
                 if (type == ARC_BUFC_METADATA) {
                         buf->b_data = zio_buf_alloc(size);
                         arc_space_consume(size, ARC_SPACE_DATA);
+
+                       /*
+                        * If we are unable to recycle an existing meta buffer
+                        * signal the reclaim thread.  It will notify users
+                        * via the prune callback to drop references.  The
+                        * prune callback is run in the context of the reclaim
+                        * thread to avoid deadlocking on the hash_lock.
+                        */
+                       cv_signal(&arc_reclaim_thr_cv);
                 } else {
                         ASSERT(type == ARC_BUFC_DATA);
                         buf->b_data = zio_data_buf_alloc(size);
                         ARCSTAT_INCR(arcstat_data_size, size);
                         atomic_add_64(&arc_size, size);
                 }
+
                 ARCSTAT_BUMP(arcstat_recycle_miss);
         }
         ASSERT(buf->b_data != NULL);
@@ -3021,6 +3100,37 @@ top:
         return (0);
  }
  
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
+{
+       arc_prune_t *p;
+
+       p = kmem_alloc(sizeof(*p), KM_SLEEP);
+       p->p_pfunc = func;
+       p->p_private = private;
+       list_link_init(&p->p_node);
+       refcount_create(&p->p_refcnt);
+
+       mutex_enter(&arc_prune_mtx);
+       refcount_add(&p->p_refcnt, &arc_prune_list);
+       list_insert_head(&arc_prune_list, p);
+       mutex_exit(&arc_prune_mtx);
+
+       return (p);
+}
+
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+       mutex_enter(&arc_prune_mtx);
+       list_remove(&arc_prune_list, p);
+       if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
+               refcount_destroy(&p->p_refcnt);
+               kmem_free(p, sizeof (*p));
+       }
+       mutex_exit(&arc_prune_mtx);
+}
+
  void
  arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
  {
@@ -3559,12 +3669,12 @@ arc_init(void)
  
         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
         arc_c_min = MAX(arc_c / 4, 64<<20);
-       /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
-       if (arc_c * 8 >= 1<<30)
-               arc_c_max = (arc_c * 8) - (1<<30);
+       /* set max to 1/2 of all memory, or all but 4GB, whichever is more */
+       if (arc_c * 8 >= ((uint64_t)4<<30))
+               arc_c_max = (arc_c * 8) - ((uint64_t)4<<30);
         else
                 arc_c_max = arc_c_min;
-       arc_c_max = MAX(arc_c * 6, arc_c_max);
+       arc_c_max = MAX(arc_c * 4, arc_c_max);
  
         /*
          * Allow the tunables to override our calculations if they are
@@ -3598,8 +3708,8 @@ arc_init(void)
         if (zfs_arc_p_min_shift > 0)
                 arc_p_min_shift = zfs_arc_p_min_shift;
  
-       if (zfs_arc_reduce_dnlc_percent > 0)
-               arc_reduce_dnlc_percent = zfs_arc_reduce_dnlc_percent;
+       if (zfs_arc_meta_prune > 0)
+               arc_meta_prune = zfs_arc_meta_prune;
  
         /* if kmem_flags are set, lets try to use less memory */
         if (kmem_debugging())
@@ -3646,7 +3756,10 @@ arc_init(void)
         buf_init();
  
         arc_thread_exit = 0;
+       list_create(&arc_prune_list, sizeof (arc_prune_t),
+           offsetof(arc_prune_t, p_node));
         arc_eviction_list = NULL;
+       mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
  
@@ -3674,6 +3787,8 @@ arc_init(void)
  void
  arc_fini(void)
  {
+       arc_prune_t *p;
+
         mutex_enter(&arc_reclaim_thr_lock);
  #ifdef _KERNEL
         spl_unregister_shrinker(&arc_shrinker);
@@ -3693,6 +3808,17 @@ arc_fini(void)
                 arc_ksp = NULL;
         }
  
+       mutex_enter(&arc_prune_mtx);
+       while ((p = list_head(&arc_prune_list)) != NULL) {
+               list_remove(&arc_prune_list, p);
+               refcount_remove(&p->p_refcnt, &arc_prune_list);
+               refcount_destroy(&p->p_refcnt);
+               kmem_free(p, sizeof (*p));
+       }
+       mutex_exit(&arc_prune_mtx);
+
+       list_destroy(&arc_prune_list);
+       mutex_destroy(&arc_prune_mtx);
         mutex_destroy(&arc_eviction_mtx);
         mutex_destroy(&arc_reclaim_thr_lock);
         cv_destroy(&arc_reclaim_thr_cv);
@@ -4774,6 +4900,8 @@ l2arc_stop(void)
  EXPORT_SYMBOL(arc_read);
  EXPORT_SYMBOL(arc_buf_remove_ref);
  EXPORT_SYMBOL(arc_getbuf_func);
+EXPORT_SYMBOL(arc_add_prune_callback);
+EXPORT_SYMBOL(arc_remove_prune_callback);
  
  module_param(zfs_arc_min, ulong, 0444);
  MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
@@ -4784,8 +4912,8 @@ MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
  module_param(zfs_arc_meta_limit, ulong, 0444);
  MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
  
-module_param(zfs_arc_reduce_dnlc_percent, int, 0444);
-MODULE_PARM_DESC(zfs_arc_reduce_dnlc_percent, "Meta reclaim percentage");
+module_param(zfs_arc_meta_prune, int, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
  
  module_param(zfs_arc_grow_retry, int, 0444);
  MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
@@ -4796,4 +4924,28 @@ MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
  module_param(zfs_arc_p_min_shift, int, 0444);
  MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
  
+module_param(l2arc_write_max, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
+
+module_param(l2arc_write_boost, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
+
+module_param(l2arc_headroom, ulong, 0444);
+MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
+
+module_param(l2arc_feed_secs, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
+
+module_param(l2arc_feed_min_ms, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
+
+module_param(l2arc_noprefetch, int, 0444);
+MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
+
+module_param(l2arc_feed_again, int, 0444);
+MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
+
+module_param(l2arc_norw, int, 0444);
+MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
+
  #endif