Soft to hard tabs

[zfs.git] / module / zfs / arc.c
diff --git a/module/zfs/arc.c b/module/zfs/arc.c

index d6d648c..ff631e6 100644 (file)
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -174,12 +174,13 @@ static boolean_t arc_warm;
  /*
   * These tunables are for performance analysis.
   */
-uint64_t zfs_arc_max;
-uint64_t zfs_arc_min;
-uint64_t zfs_arc_meta_limit = 0;
+unsigned long zfs_arc_max = 0;
+unsigned long zfs_arc_min = 0;
+unsigned long zfs_arc_meta_limit = 0;
  int zfs_arc_grow_retry = 0;
  int zfs_arc_shrink_shift = 0;
  int zfs_arc_p_min_shift = 0;
+int zfs_arc_reduce_dnlc_percent = 0;
  
  /*
   * Note that buffers can be in one of 6 states:
@@ -282,6 +283,14 @@ typedef struct arc_stats {
         kstat_named_t arcstat_l2_size;
         kstat_named_t arcstat_l2_hdr_size;
         kstat_named_t arcstat_memory_throttle_count;
+       kstat_named_t arcstat_memory_direct_count;
+       kstat_named_t arcstat_memory_indirect_count;
+       kstat_named_t arcstat_no_grow;
+       kstat_named_t arcstat_tempreserve;
+       kstat_named_t arcstat_loaned_bytes;
+       kstat_named_t arcstat_meta_used;
+       kstat_named_t arcstat_meta_limit;
+       kstat_named_t arcstat_meta_max;
  } arc_stats_t;
  
  static arc_stats_t arc_stats = {
@@ -337,7 +346,15 @@ static arc_stats_t arc_stats = {
         { "l2_io_error",                KSTAT_DATA_UINT64 },
         { "l2_size",                    KSTAT_DATA_UINT64 },
         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
-       { "memory_throttle_count",      KSTAT_DATA_UINT64 }
+       { "memory_throttle_count",      KSTAT_DATA_UINT64 },
+       { "memory_direct_count",        KSTAT_DATA_UINT64 },
+       { "memory_indirect_count",      KSTAT_DATA_UINT64 },
+       { "arc_no_grow",                KSTAT_DATA_UINT64 },
+       { "arc_tempreserve",            KSTAT_DATA_UINT64 },
+       { "arc_loaned_bytes",           KSTAT_DATA_UINT64 },
+       { "arc_meta_used",              KSTAT_DATA_UINT64 },
+       { "arc_meta_limit",             KSTAT_DATA_UINT64 },
+       { "arc_meta_max",               KSTAT_DATA_UINT64 },
  };
  
  #define        ARCSTAT(stat)   (arc_stats.stat.value.ui64)
@@ -399,13 +416,12 @@ static arc_state_t        *arc_l2c_only;
  #define        arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
  #define        arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
  #define        arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
-
-static int             arc_no_grow;    /* Don't try to grow cache size */
-static uint64_t                arc_tempreserve;
-static uint64_t                arc_loaned_bytes;
-static uint64_t                arc_meta_used;
-static uint64_t                arc_meta_limit;
-static uint64_t                arc_meta_max = 0;
+#define        arc_no_grow     ARCSTAT(arcstat_no_grow)
+#define        arc_tempreserve ARCSTAT(arcstat_tempreserve)
+#define        arc_loaned_bytes        ARCSTAT(arcstat_loaned_bytes)
+#define        arc_meta_used   ARCSTAT(arcstat_meta_used)
+#define        arc_meta_limit  ARCSTAT(arcstat_meta_limit)
+#define        arc_meta_max    ARCSTAT(arcstat_meta_max)
  
  typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
  
@@ -523,12 +539,13 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
   * Hash table routines
   */
  
-#define        HT_LOCK_PAD     64
+#define        HT_LOCK_ALIGN   64
+#define        HT_LOCK_PAD     (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
  
  struct ht_lock {
         kmutex_t        ht_lock;
  #ifdef _KERNEL
-       unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+       unsigned char   pad[HT_LOCK_PAD];
  #endif
  };
  
@@ -565,14 +582,14 @@ uint64_t zfs_crc64_table[256];
  /*
   * L2ARC Performance Tunables
   */
-uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;   /* default max write size */
-uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
-uint64_t l2arc_headroom = L2ARC_HEADROOM;      /* number of dev writes */
-uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;    /* interval seconds */
-uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;        /* min interval milliseconds */
-boolean_t l2arc_noprefetch = B_TRUE;           /* don't cache prefetch bufs */
-boolean_t l2arc_feed_again = B_TRUE;           /* turbo warmup */
-boolean_t l2arc_norw = B_TRUE;                 /* no reads during writes */
+unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;      /* def max write size */
+unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;    /* extra warmup write */
+unsigned long l2arc_headroom = L2ARC_HEADROOM;         /* # of dev writes */
+unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;       /* interval seconds */
+unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;   /* min interval msecs */
+int l2arc_noprefetch = B_TRUE;                 /* don't cache prefetch bufs */
+int l2arc_feed_again = B_TRUE;                 /* turbo warmup */
+int l2arc_norw = B_TRUE;                       /* no reads during writes */
  
  /*
   * L2ARC Internals
@@ -772,8 +789,15 @@ buf_fini(void)
  {
         int i;
  
+#if defined(_KERNEL) && defined(HAVE_SPL)
+       /* Large allocations which do not require contiguous pages
+        * should be using vmem_free() in the linux kernel */
+       vmem_free(buf_hash_table.ht_table,
+           (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#else
         kmem_free(buf_hash_table.ht_table,
             (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#endif
         for (i = 0; i < BUF_LOCKS; i++)
                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
         kmem_cache_destroy(hdr_cache);
@@ -794,6 +818,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag)
         refcount_create(&buf->b_refcnt);
         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+       list_link_init(&buf->b_arc_node);
+       list_link_init(&buf->b_l2node);
         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
  
         return (0);
@@ -873,8 +899,15 @@ buf_init(void)
                 hsize <<= 1;
  retry:
         buf_hash_table.ht_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+       /* Large allocations which do not require contiguous pages
+        * should be using vmem_zalloc() in the Linux kernel */
+       buf_hash_table.ht_table =
+           vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+#else
         buf_hash_table.ht_table =
             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+#endif
         if (buf_hash_table.ht_table == NULL) {
                 ASSERT(hsize > (1ULL << 8));
                 hsize >>= 1;
@@ -1143,6 +1176,8 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
  
         switch (type) {
+       default:
+               break;
         case ARC_SPACE_DATA:
                 ARCSTAT_INCR(arcstat_data_size, space);
                 break;
@@ -1167,6 +1202,8 @@ arc_space_return(uint64_t space, arc_space_type_t type)
         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
  
         switch (type) {
+       default:
+               break;
         case ARC_SPACE_DATA:
                 ARCSTAT_INCR(arcstat_data_size, -space);
                 break;
@@ -1704,7 +1741,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
         mutex_exit(&state->arcs_mtx);
  
         if (bytes_evicted < bytes)
-               dprintf("only evicted %lld bytes from %x",
+               dprintf("only evicted %lld bytes from %x\n",
                     (longlong_t)bytes_evicted, state);
  
         if (skipped)
@@ -1820,7 +1857,7 @@ top:
         }
  
         if (bytes_deleted < bytes)
-               dprintf("only deleted %lld bytes from %p",
+               dprintf("only deleted %lld bytes from %p\n",
                     (longlong_t)bytes_deleted, state);
  }
  
@@ -1985,9 +2022,8 @@ arc_shrink(void)
  static int
  arc_reclaim_needed(void)
  {
-       uint64_t extra;
-
  #ifdef _KERNEL
+       uint64_t extra;
  
         if (needfree)
                 return (1);
@@ -2049,14 +2085,16 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
         kmem_cache_t            *prev_data_cache = NULL;
         extern kmem_cache_t     *zio_buf_cache[];
         extern kmem_cache_t     *zio_data_buf_cache[];
-
  #ifdef _KERNEL
-       if (arc_meta_used >= arc_meta_limit) {
+       int                     retry = 0;
+
+       while ((arc_meta_used >= arc_meta_limit) && (retry < 10)) {
                 /*
                  * We are exceeding our meta-data cache limit.
                  * Purge some DNLC entries to release holds on meta-data.
                  */
                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
+               retry++;
         }
  #if defined(__i386)
         /*
@@ -2122,6 +2160,10 @@ arc_reclaim_thread(void)
                         arc_no_grow = FALSE;
                 }
  
+               /* Keep meta data usage within limits */
+               if (arc_meta_used >= arc_meta_limit)
+                       arc_kmem_reap_now(ARC_RECLAIM_CONS);
+
                 arc_adjust();
  
                 if (arc_eviction_list != NULL)
@@ -2129,7 +2171,7 @@ arc_reclaim_thread(void)
  
                 /* block until needed, or one second, whichever is shorter */
                 CALLB_CPR_SAFE_BEGIN(&cpr);
-               (void) cv_timedwait(&arc_reclaim_thr_cv,
+               (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
                     &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
         }
@@ -2140,6 +2182,59 @@ arc_reclaim_thread(void)
         thread_exit();
  }
  
+#ifdef _KERNEL
+/*
+ * Under Linux the arc shrinker may be called for synchronous (direct)
+ * reclaim, or asynchronous (indirect) reclaim.  When called by kswapd
+ * for indirect reclaim we take a conservative approach and just reap
+ * free slabs from the ARC caches.  If this proves to be insufficient
+ * direct reclaim will be triggered.  In direct reclaim a more aggressive
+ * strategy is used, data is evicted from the ARC and free slabs reaped.
+ */
+static int
+__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
+{
+       arc_reclaim_strategy_t strategy;        /* handed to arc_kmem_reap_now() */
+       int arc_reclaim;        /* page count reported back to the caller */
+
+       /* Reclaimable pages: btop(arc_size - arc_c_min) >> arc_shrink_shift, >= 0 */
+       arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
+           >> arc_shrink_shift, 0);
+       if (sc->nr_to_scan == 0)        /* nothing to scan: only report the count */
+               return (arc_reclaim);
+
+       /* Nothing above arc_c_min to give back: return -1 instead of reclaiming */
+       if (arc_reclaim <= 0)
+               return (-1);
+
+       /* gfp_mask lacks __GFP_FS: not allowed to perform filesystem reclaim */
+       if (!(sc->gfp_mask & __GFP_FS))
+               return (-1);
+
+       /* arc_reclaim_thr_lock could not be taken: a reclaim is in progress */
+       if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
+               return (-1);
+
+       if (current_is_kswapd()) {      /* indirect (asynchronous) reclaim */
+               strategy = ARC_RECLAIM_CONS;
+               ARCSTAT_INCR(arcstat_memory_indirect_count, 1);
+       } else {        /* direct (synchronous) reclaim */
+               strategy = ARC_RECLAIM_AGGR;
+               ARCSTAT_INCR(arcstat_memory_direct_count, 1);
+       }
+
+       arc_kmem_reap_now(strategy);
+       arc_reclaim = MAX(btop(((int64_t)arc_size - (int64_t)arc_c_min))
+           >> arc_shrink_shift, 0);    /* recomputed after the reap */
+       mutex_exit(&arc_reclaim_thr_lock);
+
+       return (arc_reclaim);
+}
+SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
+
+SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
+#endif /* _KERNEL */
+
  /*
   * Adapt arc info given the number of bytes we are trying to add and
   * the state that we are comming from.  This function is only called
@@ -2675,7 +2770,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
      uint32_t *arc_flags, const zbookmark_t *zb)
  {
         arc_buf_hdr_t *hdr;
-       arc_buf_t *buf;
+       arc_buf_t *buf = NULL;
         kmutex_t *hash_lock;
         zio_t *rzio;
         uint64_t guid = spa_guid(spa);
@@ -2700,7 +2795,7 @@ top:
                                 arc_callback_t  *acb = NULL;
  
                                 acb = kmem_zalloc(sizeof (arc_callback_t),
-                                   KM_SLEEP);
+                                   KM_PUSHPAGE);
                                 acb->acb_done = done;
                                 acb->acb_private = private;
                                 if (pio != NULL)
@@ -2757,7 +2852,7 @@ top:
                 uint64_t size = BP_GET_LSIZE(bp);
                 arc_callback_t  *acb;
                 vdev_t *vd = NULL;
-               uint64_t addr;
+               uint64_t addr = -1;
                 boolean_t devw = B_FALSE;
  
                 if (hdr == NULL) {
@@ -2816,7 +2911,7 @@ top:
  
                 ASSERT(!GHOST_STATE(hdr->b_state));
  
-               acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+               acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE);
                 acb->acb_done = done;
                 acb->acb_private = private;
  
@@ -2865,7 +2960,7 @@ top:
                                 ARCSTAT_BUMP(arcstat_l2_hits);
  
                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
-                                   KM_SLEEP);
+                                   KM_PUSHPAGE);
                                 cb->l2rcb_buf = buf;
                                 cb->l2rcb_spa = spa;
                                 cb->l2rcb_bp = *bp;
@@ -3035,7 +3130,7 @@ arc_release(arc_buf_t *buf, void *tag)
         arc_buf_hdr_t *hdr;
         kmutex_t *hash_lock = NULL;
         l2arc_buf_hdr_t *l2hdr;
-       uint64_t buf_size;
+       uint64_t buf_size = 0;
  
         /*
          * It would be nice to assert that if it's DMU metadata (level >
@@ -3410,7 +3505,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
          * in order to compress/encrypt/etc the data.  We therefor need to
          * make sure that there is sufficient available memory for this.
          */
-       if (error = arc_memory_throttle(reserve, anon_size, txg))
+       if ((error = arc_memory_throttle(reserve, anon_size, txg)))
                 return (error);
  
         /*
@@ -3454,6 +3549,12 @@ arc_init(void)
          * need to limit the cache to 1/8 of VM size.
          */
         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+       /*
+        * Register a shrinker to support synchronous (direct) memory
+        * reclaim from the arc.  This is done to prevent kswapd from
+        * swapping out pages when it is preferable to shrink the arc.
+        */
+       spl_register_shrinker(&arc_shrinker);
  #endif
  
         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
@@ -3479,6 +3580,7 @@ arc_init(void)
  
         /* limit meta-data to 1/4 of the arc capacity */
         arc_meta_limit = arc_c_max / 4;
+       arc_meta_max = 0;
  
         /* Allow the tunable to override if it is reasonable */
         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
@@ -3496,6 +3598,9 @@ arc_init(void)
         if (zfs_arc_p_min_shift > 0)
                 arc_p_min_shift = zfs_arc_p_min_shift;
  
+       if (zfs_arc_reduce_dnlc_percent > 0)
+               arc_reduce_dnlc_percent = zfs_arc_reduce_dnlc_percent;
+
         /* if kmem_flags are set, lets try to use less memory */
         if (kmem_debugging())
                 arc_c = arc_c / 2;
@@ -3570,6 +3675,10 @@ void
  arc_fini(void)
  {
         mutex_enter(&arc_reclaim_thr_lock);
+#ifdef _KERNEL
+       spl_unregister_shrinker(&arc_shrinker);
+#endif /* _KERNEL */
+
         arc_thread_exit = 1;
         while (arc_thread_exit != 0)
                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
@@ -4059,7 +4168,7 @@ l2arc_read_done(zio_t *zio)
  static list_t *
  l2arc_list_locked(int list_num, kmutex_t **lock)
  {
-       list_t *list;
+       list_t *list = NULL;
  
         ASSERT(list_num >= 0 && list_num <= 3);
  
@@ -4232,7 +4341,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
         list_t *list;
         uint64_t passed_sz, write_sz, buf_sz, headroom;
         void *buf_data;
-       kmutex_t *hash_lock, *list_lock;
+       kmutex_t *hash_lock, *list_lock = NULL;
         boolean_t have_lock, full;
         l2arc_write_callback_t *cb;
         zio_t *pio, *wzio;
@@ -4415,8 +4524,8 @@ l2arc_feed_thread(void)
  
         while (l2arc_thread_exit == 0) {
                 CALLB_CPR_SAFE_BEGIN(&cpr);
-               (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
-                   next);
+               (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
+                   &l2arc_feed_thr_lock, next);
                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
                 next = ddi_get_lbolt() + hz;
  
@@ -4534,6 +4643,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
         adddev->l2ad_evict = adddev->l2ad_start;
         adddev->l2ad_first = B_TRUE;
         adddev->l2ad_writing = B_FALSE;
+       list_link_init(&adddev->l2ad_node);
         ASSERT3U(adddev->l2ad_write, >, 0);
  
         /*
@@ -4659,3 +4769,55 @@ l2arc_stop(void)
                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
         mutex_exit(&l2arc_feed_thr_lock);
  }
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(arc_read);
+EXPORT_SYMBOL(arc_buf_remove_ref);
+EXPORT_SYMBOL(arc_getbuf_func);
+
+module_param(zfs_arc_min, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
+
+module_param(zfs_arc_max, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
+
+module_param(zfs_arc_meta_limit, ulong, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
+
+module_param(zfs_arc_reduce_dnlc_percent, int, 0444);
+MODULE_PARM_DESC(zfs_arc_reduce_dnlc_percent, "Meta reclaim percentage");
+
+module_param(zfs_arc_grow_retry, int, 0444);
+MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
+
+module_param(zfs_arc_shrink_shift, int, 0444);
+MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
+
+module_param(zfs_arc_p_min_shift, int, 0444);
+MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
+
+module_param(l2arc_write_max, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
+
+module_param(l2arc_write_boost, ulong, 0444);
+MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
+
+module_param(l2arc_headroom, ulong, 0444);
+MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
+
+module_param(l2arc_feed_secs, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
+
+module_param(l2arc_feed_min_ms, ulong, 0444);
+MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
+
+module_param(l2arc_noprefetch, int, 0444);
+MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
+
+module_param(l2arc_feed_again, int, 0444);
+MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
+
+module_param(l2arc_norw, int, 0444);
+MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
+
+#endif