Prototype/structure update for Linux
[zfs.git] / module / zfs / arc.c
index 8adb54d..808c8e8 100644 (file)
@@ -174,9 +174,9 @@ static boolean_t arc_warm;
 /*
  * These tunables are for performance analysis.
  */
-uint64_t zfs_arc_max;
-uint64_t zfs_arc_min;
-uint64_t zfs_arc_meta_limit = 0;
+unsigned long zfs_arc_max = 0;
+unsigned long zfs_arc_min = 0;
+unsigned long zfs_arc_meta_limit = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
@@ -523,12 +523,13 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
  * Hash table routines
  */
 
-#define        HT_LOCK_PAD     64
+#define        HT_LOCK_ALIGN   64
+#define        HT_LOCK_PAD     (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
 
 struct ht_lock {
        kmutex_t        ht_lock;
 #ifdef _KERNEL
-       unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+       unsigned char   pad[HT_LOCK_PAD];
 #endif
 };
 
@@ -772,8 +773,15 @@ buf_fini(void)
 {
        int i;
 
+#if defined(_KERNEL) && defined(HAVE_SPL)
+       /* Large allocations which do not require contiguous pages
+        * should be using vmem_free() in the linux kernel */
+       vmem_free(buf_hash_table.ht_table,
+           (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#else
        kmem_free(buf_hash_table.ht_table,
            (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#endif
        for (i = 0; i < BUF_LOCKS; i++)
                mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
        kmem_cache_destroy(hdr_cache);
@@ -794,6 +802,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag)
        refcount_create(&buf->b_refcnt);
        cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
        mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+       list_link_init(&buf->b_arc_node);
+       list_link_init(&buf->b_l2node);
        arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 
        return (0);
@@ -873,8 +883,15 @@ buf_init(void)
                hsize <<= 1;
 retry:
        buf_hash_table.ht_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+       /* Large allocations which do not require contiguous pages
+        * should be using vmem_alloc() in the linux kernel */
+       buf_hash_table.ht_table =
+           vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+#else
        buf_hash_table.ht_table =
            kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+#endif
        if (buf_hash_table.ht_table == NULL) {
                ASSERT(hsize > (1ULL << 8));
                hsize >>= 1;
@@ -952,11 +969,6 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
-       kmutex_t *hash_lock;
-
-       hash_lock = HDR_LOCK(buf->b_hdr);
-       mutex_enter(hash_lock);
-
        if (zfs_flags & ZFS_DEBUG_MODIFY) {
                if (buf->b_hdr->b_state != arc_anon)
                        panic("modifying non-anon buffer!");
@@ -978,7 +990,6 @@ arc_buf_thaw(arc_buf_t *buf)
        }
 
        mutex_exit(&buf->b_hdr->b_freeze_lock);
-       mutex_exit(hash_lock);
 }
 
 void
@@ -1149,6 +1160,8 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
        ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
        switch (type) {
+       default:
+               break;
        case ARC_SPACE_DATA:
                ARCSTAT_INCR(arcstat_data_size, space);
                break;
@@ -1173,6 +1186,8 @@ arc_space_return(uint64_t space, arc_space_type_t type)
        ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
        switch (type) {
+       default:
+               break;
        case ARC_SPACE_DATA:
                ARCSTAT_INCR(arcstat_data_size, -space);
                break;
@@ -1432,10 +1447,11 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
 static void
 arc_hdr_destroy(arc_buf_hdr_t *hdr)
 {
+       l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+
        ASSERT(refcount_is_zero(&hdr->b_refcnt));
        ASSERT3P(hdr->b_state, ==, arc_anon);
        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-       l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
 
        if (l2hdr != NULL) {
                boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
@@ -1709,7 +1725,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
        mutex_exit(&state->arcs_mtx);
 
        if (bytes_evicted < bytes)
-               dprintf("only evicted %lld bytes from %x",
+               dprintf("only evicted %lld bytes from %x\n",
                    (longlong_t)bytes_evicted, state);
 
        if (skipped)
@@ -1730,12 +1746,12 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
                        int64_t todelete =
                            MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
-                       arc_evict_ghost(arc_mru_ghost, NULL, todelete);
+                       arc_evict_ghost(arc_mru_ghost, 0, todelete);
                } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
                        int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
                            arc_mru_ghost->arcs_size +
                            arc_mfu_ghost->arcs_size - arc_c);
-                       arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
+                       arc_evict_ghost(arc_mfu_ghost, 0, todelete);
                }
        }
 
@@ -1750,18 +1766,25 @@ static void
 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
 {
        arc_buf_hdr_t *ab, *ab_prev;
+       arc_buf_hdr_t marker;
        list_t *list = &state->arcs_list[ARC_BUFC_DATA];
        kmutex_t *hash_lock;
        uint64_t bytes_deleted = 0;
        uint64_t bufs_skipped = 0;
 
        ASSERT(GHOST_STATE(state));
+       bzero(&marker, sizeof(marker));
 top:
        mutex_enter(&state->arcs_mtx);
        for (ab = list_tail(list); ab; ab = ab_prev) {
                ab_prev = list_prev(list, ab);
                if (spa && ab->b_spa != spa)
                        continue;
+
+               /* ignore markers */
+               if (ab->b_spa == 0)
+                       continue;
+
                hash_lock = HDR_LOCK(ab);
                /* caller may be trying to modify this buffer, skip it */
                if (MUTEX_HELD(hash_lock))
@@ -1788,15 +1811,21 @@ top:
                        DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
                        if (bytes >= 0 && bytes_deleted >= bytes)
                                break;
-               } else {
-                       if (bytes < 0) {
-                               mutex_exit(&state->arcs_mtx);
-                               mutex_enter(hash_lock);
-                               mutex_exit(hash_lock);
-                               goto top;
-                       }
+               } else if (bytes < 0) {
+                       /*
+                        * Insert a list marker and then wait for the
+                        * hash lock to become available. Once its
+                        * available, restart from where we left off.
+                        */
+                       list_insert_after(list, ab, &marker);
+                       mutex_exit(&state->arcs_mtx);
+                       mutex_enter(hash_lock);
+                       mutex_exit(hash_lock);
+                       mutex_enter(&state->arcs_mtx);
+                       ab_prev = list_prev(list, &marker);
+                       list_remove(list, &marker);
+               } else
                        bufs_skipped += 1;
-               }
        }
        mutex_exit(&state->arcs_mtx);
 
@@ -1812,7 +1841,7 @@ top:
        }
 
        if (bytes_deleted < bytes)
-               dprintf("only deleted %lld bytes from %p",
+               dprintf("only deleted %lld bytes from %p\n",
                    (longlong_t)bytes_deleted, state);
 }
 
@@ -1825,18 +1854,19 @@ arc_adjust(void)
         * Adjust MRU size
         */
 
-       adjustment = MIN(arc_size - arc_c,
-           arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p);
+       adjustment = MIN((int64_t)(arc_size - arc_c),
+           (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
+           arc_p));
 
        if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
                delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
-               (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
+               (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
                adjustment -= delta;
        }
 
        if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
                delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
-               (void) arc_evict(arc_mru, NULL, delta, FALSE,
+               (void) arc_evict(arc_mru, 0, delta, FALSE,
                    ARC_BUFC_METADATA);
        }
 
@@ -1848,14 +1878,14 @@ arc_adjust(void)
 
        if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
                delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
-               (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
+               (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
                adjustment -= delta;
        }
 
        if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
                int64_t delta = MIN(adjustment,
                    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
-               (void) arc_evict(arc_mfu, NULL, delta, FALSE,
+               (void) arc_evict(arc_mfu, 0, delta, FALSE,
                    ARC_BUFC_METADATA);
        }
 
@@ -1867,7 +1897,7 @@ arc_adjust(void)
 
        if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
                delta = MIN(arc_mru_ghost->arcs_size, adjustment);
-               arc_evict_ghost(arc_mru_ghost, NULL, delta);
+               arc_evict_ghost(arc_mru_ghost, 0, delta);
        }
 
        adjustment =
@@ -1875,7 +1905,7 @@ arc_adjust(void)
 
        if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
                delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
-               arc_evict_ghost(arc_mfu_ghost, NULL, delta);
+               arc_evict_ghost(arc_mfu_ghost, 0, delta);
        }
 }
 
@@ -1976,9 +2006,8 @@ arc_shrink(void)
 static int
 arc_reclaim_needed(void)
 {
-       uint64_t extra;
-
 #ifdef _KERNEL
+       uint64_t extra;
 
        if (needfree)
                return (1);
@@ -2113,16 +2142,14 @@ arc_reclaim_thread(void)
                        arc_no_grow = FALSE;
                }
 
-               if (2 * arc_c < arc_size +
-                   arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
-                       arc_adjust();
+               arc_adjust();
 
                if (arc_eviction_list != NULL)
                        arc_do_user_evicts();
 
                /* block until needed, or one second, whichever is shorter */
                CALLB_CPR_SAFE_BEGIN(&cpr);
-               (void) cv_timedwait(&arc_reclaim_thr_cv,
+               (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
                    &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
                CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
        }
@@ -2159,6 +2186,7 @@ arc_adapt(int bytes, arc_state_t *state)
        if (state == arc_mru_ghost) {
                mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
                    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+               mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
 
                arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
        } else if (state == arc_mfu_ghost) {
@@ -2166,6 +2194,7 @@ arc_adapt(int bytes, arc_state_t *state)
 
                mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
                    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+               mult = MIN(mult, 10);
 
                delta = MIN(bytes * mult, arc_p);
                arc_p = MAX(arc_p_min, arc_p - delta);
@@ -2295,7 +2324,7 @@ arc_get_data_buf(arc_buf_t *buf)
                state =  (arc_mru->arcs_lsize[type] >= size &&
                    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
        }
-       if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
+       if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
                if (type == ARC_BUFC_METADATA) {
                        buf->b_data = zio_buf_alloc(size);
                        arc_space_consume(size, ARC_SPACE_DATA);
@@ -2666,7 +2695,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     uint32_t *arc_flags, const zbookmark_t *zb)
 {
        arc_buf_hdr_t *hdr;
-       arc_buf_t *buf;
+       arc_buf_t *buf = NULL;
        kmutex_t *hash_lock;
        zio_t *rzio;
        uint64_t guid = spa_guid(spa);
@@ -2748,7 +2777,7 @@ top:
                uint64_t size = BP_GET_LSIZE(bp);
                arc_callback_t  *acb;
                vdev_t *vd = NULL;
-               uint64_t addr;
+               uint64_t addr = -1;
                boolean_t devw = B_FALSE;
 
                if (hdr == NULL) {
@@ -3026,7 +3055,7 @@ arc_release(arc_buf_t *buf, void *tag)
        arc_buf_hdr_t *hdr;
        kmutex_t *hash_lock = NULL;
        l2arc_buf_hdr_t *l2hdr;
-       uint64_t buf_size;
+       uint64_t buf_size = 0;
 
        /*
         * It would be nice to assert that if it's DMU metadata (level >
@@ -3401,7 +3430,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
         * in order to compress/encrypt/etc the data.  We therefor need to
         * make sure that there is sufficient available memory for this.
         */
-       if (error = arc_memory_throttle(reserve, anon_size, txg))
+       if ((error = arc_memory_throttle(reserve, anon_size, txg)))
                return (error);
 
        /*
@@ -3870,7 +3899,7 @@ out:
  * Free buffers that were tagged for destruction.
  */
 static void
-l2arc_do_free_on_write()
+l2arc_do_free_on_write(void)
 {
        list_t *buflist;
        l2arc_data_free_t *df, *df_prev;
@@ -4050,7 +4079,7 @@ l2arc_read_done(zio_t *zio)
 static list_t *
 l2arc_list_locked(int list_num, kmutex_t **lock)
 {
-       list_t *list;
+       list_t *list = NULL;
 
        ASSERT(list_num >= 0 && list_num <= 3);
 
@@ -4223,11 +4252,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
        list_t *list;
        uint64_t passed_sz, write_sz, buf_sz, headroom;
        void *buf_data;
-       kmutex_t *hash_lock, *list_lock;
+       kmutex_t *hash_lock, *list_lock = NULL;
        boolean_t have_lock, full;
        l2arc_write_callback_t *cb;
        zio_t *pio, *wzio;
        uint64_t guid = spa_guid(spa);
+       int try;
 
        ASSERT(dev->l2ad_vdev != NULL);
 
@@ -4241,7 +4271,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
         * Copy buffers for L2ARC writing.
         */
        mutex_enter(&l2arc_buflist_mtx);
-       for (int try = 0; try <= 3; try++) {
+       for (try = 0; try <= 3; try++) {
                list = l2arc_list_locked(try, &list_lock);
                passed_sz = 0;
 
@@ -4405,8 +4435,8 @@ l2arc_feed_thread(void)
 
        while (l2arc_thread_exit == 0) {
                CALLB_CPR_SAFE_BEGIN(&cpr);
-               (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
-                   next);
+               (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
+                   &l2arc_feed_thr_lock, next);
                CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
                next = ddi_get_lbolt() + hz;
 
@@ -4438,6 +4468,16 @@ l2arc_feed_thread(void)
                ASSERT(spa != NULL);
 
                /*
+                * If the pool is read-only then force the feed thread to
+                * sleep a little longer.
+                */
+               if (!spa_writeable(spa)) {
+                       next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
+                       spa_config_exit(spa, SCL_L2ARC, dev);
+                       continue;
+               }
+
+               /*
                 * Avoid contributing to memory pressure.
                 */
                if (arc_reclaim_needed()) {
@@ -4514,6 +4554,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
        adddev->l2ad_evict = adddev->l2ad_start;
        adddev->l2ad_first = B_TRUE;
        adddev->l2ad_writing = B_FALSE;
+       list_link_init(&adddev->l2ad_node);
        ASSERT3U(adddev->l2ad_write, >, 0);
 
        /*
@@ -4639,3 +4680,18 @@ l2arc_stop(void)
                cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
        mutex_exit(&l2arc_feed_thr_lock);
 }
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(arc_read);
+EXPORT_SYMBOL(arc_buf_remove_ref);
+EXPORT_SYMBOL(arc_getbuf_func);
+
+module_param(zfs_arc_min, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_min, "Minimum arc size");
+
+module_param(zfs_arc_max, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_max, "Maximum arc size");
+
+module_param(zfs_arc_meta_limit, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
+#endif