Illumos #3805 arc shouldn't cache freed blocks
[zfs.git] / module / zfs / arc.c
index 829b4d9..2667725 100644 (file)
@@ -80,9 +80,9 @@
  * types of locks: 1) the hash table lock array, and 2) the
  * arc list locks.
  *
- * Buffers do not have their own mutexs, rather they rely on the
- * hash table mutexs for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexs).
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
  *
  * buf_hash_find() returns the appropriate mutex (held) when it
  * locates the requested buffer in the hash table.  It returns
@@ -189,6 +189,8 @@ unsigned long zfs_arc_meta_limit = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
+int zfs_arc_memory_throttle_disable = 1;
+int zfs_disable_dup_eviction = 0;
 int zfs_arc_meta_prune = 0;
 
 /*
@@ -307,6 +309,9 @@ typedef struct arc_stats {
        kstat_named_t arcstat_l2_size;
        kstat_named_t arcstat_l2_hdr_size;
        kstat_named_t arcstat_memory_throttle_count;
+       kstat_named_t arcstat_duplicate_buffers;
+       kstat_named_t arcstat_duplicate_buffers_size;
+       kstat_named_t arcstat_duplicate_reads;
        kstat_named_t arcstat_memory_direct_count;
        kstat_named_t arcstat_memory_indirect_count;
        kstat_named_t arcstat_no_grow;
@@ -387,6 +392,9 @@ static arc_stats_t arc_stats = {
        { "l2_size",                    KSTAT_DATA_UINT64 },
        { "l2_hdr_size",                KSTAT_DATA_UINT64 },
        { "memory_throttle_count",      KSTAT_DATA_UINT64 },
+       { "duplicate_buffers",          KSTAT_DATA_UINT64 },
+       { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
+       { "duplicate_reads",            KSTAT_DATA_UINT64 },
        { "memory_direct_count",        KSTAT_DATA_UINT64 },
        { "memory_indirect_count",      KSTAT_DATA_UINT64 },
        { "arc_no_grow",                KSTAT_DATA_UINT64 },
@@ -1069,7 +1077,7 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
                ASSERT(list_link_active(&ab->b_arc_node));
                list_remove(list, ab);
                if (GHOST_STATE(ab->b_state)) {
-                       ASSERT3U(ab->b_datacnt, ==, 0);
+                       ASSERT0(ab->b_datacnt);
                        ASSERT3P(ab->b_buf, ==, NULL);
                        delta = ab->b_size;
                }
@@ -1369,6 +1377,17 @@ arc_buf_clone(arc_buf_t *from)
        hdr->b_buf = buf;
        arc_get_data_buf(buf);
        bcopy(from->b_data, buf->b_data, size);
+
+       /*
+        * This buffer already exists in the arc so create a duplicate
+        * copy for the caller.  If the buffer is associated with user data
+        * then track the size and number of duplicates.  These stats will be
+        * updated as duplicate buffers are created and destroyed.
+        */
+       if (hdr->b_type == ARC_BUFC_DATA) {
+               ARCSTAT_BUMP(arcstat_duplicate_buffers);
+               ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
+       }
        hdr->b_datacnt += 1;
        return (buf);
 }
@@ -1467,6 +1486,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
                ASSERT3U(state->arcs_size, >=, size);
                atomic_add_64(&state->arcs_size, -size);
                buf->b_data = NULL;
+
+               /*
+                * If we're destroying a duplicate buffer make sure
+                * that the appropriate statistics are updated.
+                */
+               if (buf->b_hdr->b_datacnt > 1 &&
+                   buf->b_hdr->b_type == ARC_BUFC_DATA) {
+                       ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+                       ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
+               }
                ASSERT(buf->b_hdr->b_datacnt > 0);
                buf->b_hdr->b_datacnt -= 1;
        }
@@ -1652,6 +1681,48 @@ arc_buf_size(arc_buf_t *buf)
 }
 
 /*
+ * Called from the DMU to determine if the current buffer should be
+ * evicted. In order to ensure proper locking, the eviction must be initiated
+ * from the DMU. Return true if the buffer is associated with user data and
+ * duplicate buffers still exist.
+ */
+boolean_t
+arc_buf_eviction_needed(arc_buf_t *buf)
+{
+       arc_buf_hdr_t *hdr;
+       boolean_t evict_needed = B_FALSE;
+
+       if (zfs_disable_dup_eviction)
+               return (B_FALSE);
+
+       mutex_enter(&buf->b_evict_lock);
+       hdr = buf->b_hdr;
+       if (hdr == NULL) {
+               /*
+                * We are in arc_do_user_evicts(); let that function
+                * perform the eviction.
+                */
+               ASSERT(buf->b_data == NULL);
+               mutex_exit(&buf->b_evict_lock);
+               return (B_FALSE);
+       } else if (buf->b_data == NULL) {
+               /*
+                * We have already been added to the arc eviction list;
+                * recommend eviction.
+                */
+               ASSERT3P(hdr, ==, &arc_eviction_hdr);
+               mutex_exit(&buf->b_evict_lock);
+               return (B_TRUE);
+       }
+
+       if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
+               evict_needed = B_TRUE;
+
+       mutex_exit(&buf->b_evict_lock);
+       return (evict_needed);
+}
+
+/*
  * Evict buffers from list until we've removed the specified number of
  * bytes.  Move the removed buffers to the appropriate evict state.
  * If the recycle flag is set, then attempt to "recycle" a buffer:
@@ -1701,7 +1772,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                hash_lock = HDR_LOCK(ab);
                have_lock = MUTEX_HELD(hash_lock);
                if (have_lock || mutex_tryenter(hash_lock)) {
-                       ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+                       ASSERT0(refcount_count(&ab->b_refcnt));
                        ASSERT(ab->b_datacnt > 0);
                        while (ab->b_buf) {
                                arc_buf_t *buf = ab->b_buf;
@@ -2414,18 +2485,6 @@ arc_evict_needed(arc_buf_contents_t type)
        if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
                return (1);
 
-#ifdef _KERNEL
-       /*
-        * If zio data pages are being allocated out of a separate heap segment,
-        * then enforce that the size of available vmem for this area remains
-        * above about 1/32nd free.
-        */
-       if (type == ARC_BUFC_DATA && zio_arena != NULL &&
-           vmem_size(zio_arena, VMEM_FREE) <
-           (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
-               return (1);
-#endif
-
        if (arc_no_grow)
                return (1);
 
@@ -2659,7 +2718,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                         * This is a prefetch access...
                         * move this block back to the MRU state.
                         */
-                       ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
+                       ASSERT0(refcount_count(&buf->b_refcnt));
                        new_state = arc_mru;
                }
 
@@ -2741,10 +2800,12 @@ arc_read_done(zio_t *zio)
        callback_list = hdr->b_acb;
        ASSERT(callback_list != NULL);
        if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
-               arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
-                   byteswap_uint64_array :
-                   dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
-               func(buf->b_data, hdr->b_size);
+               dmu_object_byteswap_t bswap =
+                   DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
+               if (BP_GET_LEVEL(zio->io_bp) > 0)
+                   byteswap_uint64_array(buf->b_data, hdr->b_size);
+               else
+                   dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
        }
 
        arc_cksum_compute(buf, B_FALSE);
@@ -2763,8 +2824,10 @@ arc_read_done(zio_t *zio)
        abuf = buf;
        for (acb = callback_list; acb; acb = acb->acb_next) {
                if (acb->acb_done) {
-                       if (abuf == NULL)
+                       if (abuf == NULL) {
+                               ARCSTAT_BUMP(arcstat_duplicate_reads);
                                abuf = arc_buf_clone(buf);
+                       }
                        acb->acb_buf = abuf;
                        abuf = NULL;
                }
@@ -2828,7 +2891,7 @@ arc_read_done(zio_t *zio)
 }
 
 /*
- * "Read" the block block at the specified DVA (in bp) via the
+ * "Read" the block at the specified DVA (in bp) via the
  * cache.  If the block is found in the cache, invoke the provided
  * callback immediately and return.  Note that the `zio' parameter
  * in the callback will be NULL in this case, since no IO was
@@ -2998,7 +3061,7 @@ top:
                        /* this block is in the ghost cache */
                        ASSERT(GHOST_STATE(hdr->b_state));
                        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-                       ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
+                       ASSERT0(refcount_count(&hdr->b_refcnt));
                        ASSERT(hdr->b_buf == NULL);
 
                        /* if this is a prefetch, we don't have a reference */
@@ -3178,6 +3241,34 @@ arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
 }
 
 /*
+ * Notify the arc that a block was freed, and thus will never be used again.
+ */
+void
+arc_freed(spa_t *spa, const blkptr_t *bp)
+{
+       arc_buf_hdr_t *hdr;
+       kmutex_t *hash_lock;
+       uint64_t guid = spa_load_guid(spa);
+
+       hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+           &hash_lock);
+       if (hdr == NULL)
+               return;
+       if (HDR_BUF_AVAILABLE(hdr)) {
+               arc_buf_t *buf = hdr->b_buf;
+               add_reference(hdr, hash_lock, FTAG);
+               hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+               mutex_exit(hash_lock);
+
+               arc_release(buf, FTAG);
+               (void) arc_buf_remove_ref(buf, FTAG);
+       } else {
+               mutex_exit(hash_lock);
+       }
+
+}
+
+/*
  * This is used by the DMU to let the ARC know that a buffer is
  * being evicted, so the ARC should clean up.  If this arc buf
  * is not yet in the evicted state, it will be put there.
@@ -3334,6 +3425,16 @@ arc_release(arc_buf_t *buf, void *tag)
                        ASSERT3U(*size, >=, hdr->b_size);
                        atomic_add_64(size, -hdr->b_size);
                }
+
+               /*
+                * We're releasing a duplicate user data buffer, update
+                * our statistics accordingly.
+                */
+               if (hdr->b_type == ARC_BUFC_DATA) {
+                       ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+                       ARCSTAT_INCR(arcstat_duplicate_buffers_size,
+                           -hdr->b_size);
+               }
                hdr->b_datacnt -= 1;
                arc_cksum_verify(buf);
 
@@ -3561,12 +3662,11 @@ arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
 #ifdef _KERNEL
        uint64_t available_memory;
 
+       if (zfs_arc_memory_throttle_disable)
+               return (0);
+
        /* Easily reclaimable memory (free + inactive + arc-evictable) */
        available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();
-#if defined(__i386)
-       available_memory =
-           MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
-#endif
 
        if (available_memory <= zfs_write_limit_max) {
                ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
@@ -4651,7 +4751,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
        mutex_exit(&l2arc_buflist_mtx);
 
        if (pio == NULL) {
-               ASSERT3U(write_sz, ==, 0);
+               ASSERT0(write_sz);
                kmem_cache_free(hdr_cache, head);
                return (0);
        }
@@ -4974,6 +5074,12 @@ MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
 module_param(zfs_arc_p_min_shift, int, 0444);
 MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
 
+module_param(zfs_disable_dup_eviction, int, 0644);
+MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
+
+module_param(zfs_arc_memory_throttle_disable, int, 0644);
+MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");
+
 module_param(l2arc_write_max, ulong, 0444);
 MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");