module/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24
  25 /*
  26  * DVA-based Adjustable Replacement Cache
  27  *
  28  * While much of the theory of operation used here is
  29  * based on the self-tuning, low overhead replacement cache
  30  * presented by Megiddo and Modha at FAST 2003, there are some
  31  * significant differences:
  32  *
  33  * 1. The Megiddo and Modha model assumes any page is evictable.
  34  * Pages in its cache cannot be "locked" into memory.  This makes
  35  * the eviction algorithm simple: evict the last page in the list.
  36  * This also make the performance characteristics easy to reason
  37  * about.  Our cache is not so simple.  At any given moment, some
  38  * subset of the blocks in the cache are un-evictable because we
  39  * have handed out a reference to them.  Blocks are only evictable
  40  * when there are no external references active.  This makes
  41  * eviction far more problematic:  we choose to evict the evictable
  42  * blocks that are the "lowest" in the list.
  43  *
  44  * There are times when it is not possible to evict the requested
  45  * space.  In these circumstances we are unable to adjust the cache
  46  * size.  To prevent the cache growing unbounded at these times we
  47  * implement a "cache throttle" that slows the flow of new data
  48  * into the cache until we can make space available.
  49  *
  50  * 2. The Megiddo and Modha model assumes a fixed cache size.
  51  * Pages are evicted when the cache is full and there is a cache
  52  * miss.  Our model has a variable sized cache.  It grows with
  53  * high use, but also tries to react to memory pressure from the
  54  * operating system: decreasing its size when system memory is
  55  * tight.
  56  *
  57  * 3. The Megiddo and Modha model assumes a fixed page size. All
  58  * elements of the cache are therefor exactly the same size.  So
  59  * when adjusting the cache size following a cache miss, its simply
  60  * a matter of choosing a single page to evict.  In our model, we
  61  * have variable sized cache blocks (rangeing from 512 bytes to
  62  * 128K bytes).  We therefor choose a set of blocks to evict to make
  63  * space for a cache miss that approximates as closely as possible
  64  * the space used by the new block.
  65  *
  66  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  67  * by N. Megiddo & D. Modha, FAST 2003
  68  */
  69
  70 /*
  71  * The locking model:
  72  *
  73  * A new reference to a cache buffer can be obtained in two
  74  * ways: 1) via a hash table lookup using the DVA as a key,
  75  * or 2) via one of the ARC lists.  The arc_read() interface
  76  * uses method 1, while the internal arc algorithms for
  77  * adjusting the cache use method 2.  We therefor provide two
  78  * types of locks: 1) the hash table lock array, and 2) the
  79  * arc list locks.
  80  *
  81  * Buffers do not have their own mutexs, rather they rely on the
  82  * hash table mutexs for the bulk of their protection (i.e. most
  83  * fields in the arc_buf_hdr_t are protected by these mutexs).
  84  *
  85  * buf_hash_find() returns the appropriate mutex (held) when it
  86  * locates the requested buffer in the hash table.  It returns
  87  * NULL for the mutex if the buffer was not in the table.
  88  *
  89  * buf_hash_remove() expects the appropriate hash mutex to be
  90  * already held before it is invoked.
  91  *
  92  * Each arc state also has a mutex which is used to protect the
  93  * buffer list associated with the state.  When attempting to
  94  * obtain a hash table lock while holding an arc list lock you
  95  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  96  * the active state mutex must be held before the ghost state mutex.
  97  *
  98  * Arc buffers may have an associated eviction callback function.
  99  * This function will be invoked prior to removing the buffer (e.g.
 100  * in arc_do_user_evicts()).  Note however that the data associated
 101  * with the buffer may be evicted prior to the callback.  The callback
 102  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 103  * the users of callbacks must ensure that their private data is
 104  * protected from simultaneous callbacks from arc_buf_evict()
 105  * and arc_do_user_evicts().
 106  *
 107  * It as also possible to register a callback which is run when the
 108  * arc_meta_limit is reached and no buffers can be safely evicted.  In
 109  * this case the arc user should drop a reference on some arc buffers so
 110  * they can be reclaimed and the arc_meta_limit honored.  For example,
 111  * when using the ZPL each dentry holds a references on a znode.  These
 112  * dentries must be pruned before the arc buffer holding the znode can
 113  * be safely evicted.
 114  *
 115  * Note that the majority of the performance stats are manipulated
 116  * with atomic operations.
 117  *
 118  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 119  *
 120  *      - L2ARC buflist creation
 121  *      - L2ARC buflist eviction
 122  *      - L2ARC write completion, which walks L2ARC buflists
 123  *      - ARC header destruction, as it removes from L2ARC buflists
 124  *      - ARC header release, as it removes from L2ARC buflists
 125  */
 126
 127 #include <sys/spa.h>
 128 #include <sys/zio.h>
 129 #include <sys/zfs_context.h>
 130 #include <sys/arc.h>
 131 #include <sys/vdev.h>
 132 #include <sys/vdev_impl.h>
 133 #ifdef _KERNEL
 134 #include <sys/vmsystm.h>
 135 #include <vm/anon.h>
 136 #include <sys/fs/swapnode.h>
 137 #include <sys/zpl.h>
 138 #endif
 139 #include <sys/callb.h>
 140 #include <sys/kstat.h>
 141 #include <sys/dmu_tx.h>
 142 #include <zfs_fletcher.h>
 143
 144 static kmutex_t         arc_reclaim_thr_lock;
 145 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 146 static uint8_t          arc_thread_exit;
 147
 148 extern int zfs_write_limit_shift;
 149 extern uint64_t zfs_write_limit_max;
 150 extern kmutex_t zfs_write_limit_lock;
 151
 152 /* number of bytes to prune from caches when at arc_meta_limit is reached */
 153 uint_t arc_meta_prune = 1048576;
 154
 155 typedef enum arc_reclaim_strategy {
 156         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 157         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 158 } arc_reclaim_strategy_t;
 159
 160 /* number of seconds before growing cache again */
 161 static int              arc_grow_retry = 5;
 162
 163 /* expiration time for arc_no_grow */
 164 static clock_t          arc_grow_time = 0;
 165
 166 /* shift of arc_c for calculating both min and max arc_p */
 167 static int              arc_p_min_shift = 4;
 168
 169 /* log2(fraction of arc to reclaim) */
 170 static int              arc_shrink_shift = 5;
 171
 172 /*
 173  * minimum lifespan of a prefetch block in clock ticks
 174  * (initialized in arc_init())
 175  */
 176 static int              arc_min_prefetch_lifespan;
 177
 178 static int arc_dead;
 179
 180 /*
 181  * The arc has filled available memory and has now warmed up.
 182  */
 183 static boolean_t arc_warm;
 184
 185 /*
 186  * These tunables are for performance analysis.
 187  */
 188 unsigned long zfs_arc_max = 0;
 189 unsigned long zfs_arc_min = 0;
 190 unsigned long zfs_arc_meta_limit = 0;
 191 int zfs_arc_grow_retry = 0;
 192 int zfs_arc_shrink_shift = 0;
 193 int zfs_arc_p_min_shift = 0;
 194 int zfs_arc_meta_prune = 0;
 195
 196 /*
 197  * Note that buffers can be in one of 6 states:
 198  *      ARC_anon        - anonymous (discussed below)
 199  *      ARC_mru         - recently used, currently cached
 200  *      ARC_mru_ghost   - recentely used, no longer in cache
 201  *      ARC_mfu         - frequently used, currently cached
 202  *      ARC_mfu_ghost   - frequently used, no longer in cache
 203  *      ARC_l2c_only    - exists in L2ARC but not other states
 204  * When there are no active references to the buffer, they are
 205  * are linked onto a list in one of these arc states.  These are
 206  * the only buffers that can be evicted or deleted.  Within each
 207  * state there are multiple lists, one for meta-data and one for
 208  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 209  * etc.) is tracked separately so that it can be managed more
 210  * explicitly: favored over data, limited explicitly.
 211  *
 212  * Anonymous buffers are buffers that are not associated with
 213  * a DVA.  These are buffers that hold dirty block copies
 214  * before they are written to stable storage.  By definition,
 215  * they are "ref'd" and are considered part of arc_mru
 216  * that cannot be freed.  Generally, they will aquire a DVA
 217  * as they are written and migrate onto the arc_mru list.
 218  *
 219  * The ARC_l2c_only state is for buffers that are in the second
 220  * level ARC but no longer in any of the ARC_m* lists.  The second
 221  * level ARC itself may also contain buffers that are in any of
 222  * the ARC_m* states - meaning that a buffer can exist in two
 223  * places.  The reason for the ARC_l2c_only state is to keep the
 224  * buffer header in the hash table, so that reads that hit the
 225  * second level ARC benefit from these fast lookups.
 226  */
 227
 228 typedef struct arc_state {
 229         list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
 230         uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
 231         uint64_t arcs_size;     /* total amount of data in this state */
 232         kmutex_t arcs_mtx;
 233 } arc_state_t;
 234
 235 /* The 6 states: */
 236 static arc_state_t ARC_anon;
 237 static arc_state_t ARC_mru;
 238 static arc_state_t ARC_mru_ghost;
 239 static arc_state_t ARC_mfu;
 240 static arc_state_t ARC_mfu_ghost;
 241 static arc_state_t ARC_l2c_only;
 242
 243 typedef struct arc_stats {
 244         kstat_named_t arcstat_hits;
 245         kstat_named_t arcstat_misses;
 246         kstat_named_t arcstat_demand_data_hits;
 247         kstat_named_t arcstat_demand_data_misses;
 248         kstat_named_t arcstat_demand_metadata_hits;
 249         kstat_named_t arcstat_demand_metadata_misses;
 250         kstat_named_t arcstat_prefetch_data_hits;
 251         kstat_named_t arcstat_prefetch_data_misses;
 252         kstat_named_t arcstat_prefetch_metadata_hits;
 253         kstat_named_t arcstat_prefetch_metadata_misses;
 254         kstat_named_t arcstat_mru_hits;
 255         kstat_named_t arcstat_mru_ghost_hits;
 256         kstat_named_t arcstat_mfu_hits;
 257         kstat_named_t arcstat_mfu_ghost_hits;
 258         kstat_named_t arcstat_deleted;
 259         kstat_named_t arcstat_recycle_miss;
 260         kstat_named_t arcstat_mutex_miss;
 261         kstat_named_t arcstat_evict_skip;
 262         kstat_named_t arcstat_evict_l2_cached;
 263         kstat_named_t arcstat_evict_l2_eligible;
 264         kstat_named_t arcstat_evict_l2_ineligible;
 265         kstat_named_t arcstat_hash_elements;
 266         kstat_named_t arcstat_hash_elements_max;
 267         kstat_named_t arcstat_hash_collisions;
 268         kstat_named_t arcstat_hash_chains;
 269         kstat_named_t arcstat_hash_chain_max;
 270         kstat_named_t arcstat_p;
 271         kstat_named_t arcstat_c;
 272         kstat_named_t arcstat_c_min;
 273         kstat_named_t arcstat_c_max;
 274         kstat_named_t arcstat_size;
 275         kstat_named_t arcstat_hdr_size;
 276         kstat_named_t arcstat_data_size;
 277         kstat_named_t arcstat_other_size;
 278         kstat_named_t arcstat_anon_size;
 279         kstat_named_t arcstat_anon_evict_data;
 280         kstat_named_t arcstat_anon_evict_metadata;
 281         kstat_named_t arcstat_mru_size;
 282         kstat_named_t arcstat_mru_evict_data;
 283         kstat_named_t arcstat_mru_evict_metadata;
 284         kstat_named_t arcstat_mru_ghost_size;
 285         kstat_named_t arcstat_mru_ghost_evict_data;
 286         kstat_named_t arcstat_mru_ghost_evict_metadata;
 287         kstat_named_t arcstat_mfu_size;
 288         kstat_named_t arcstat_mfu_evict_data;
 289         kstat_named_t arcstat_mfu_evict_metadata;
 290         kstat_named_t arcstat_mfu_ghost_size;
 291         kstat_named_t arcstat_mfu_ghost_evict_data;
 292         kstat_named_t arcstat_mfu_ghost_evict_metadata;
 293         kstat_named_t arcstat_l2_hits;
 294         kstat_named_t arcstat_l2_misses;
 295         kstat_named_t arcstat_l2_feeds;
 296         kstat_named_t arcstat_l2_rw_clash;
 297         kstat_named_t arcstat_l2_read_bytes;
 298         kstat_named_t arcstat_l2_write_bytes;
 299         kstat_named_t arcstat_l2_writes_sent;
 300         kstat_named_t arcstat_l2_writes_done;
 301         kstat_named_t arcstat_l2_writes_error;
 302         kstat_named_t arcstat_l2_writes_hdr_miss;
 303         kstat_named_t arcstat_l2_evict_lock_retry;
 304         kstat_named_t arcstat_l2_evict_reading;
 305         kstat_named_t arcstat_l2_free_on_write;
 306         kstat_named_t arcstat_l2_abort_lowmem;
 307         kstat_named_t arcstat_l2_cksum_bad;
 308         kstat_named_t arcstat_l2_io_error;
 309         kstat_named_t arcstat_l2_size;
 310         kstat_named_t arcstat_l2_hdr_size;
 311         kstat_named_t arcstat_memory_throttle_count;
 312         kstat_named_t arcstat_memory_direct_count;
 313         kstat_named_t arcstat_memory_indirect_count;
 314         kstat_named_t arcstat_no_grow;
 315         kstat_named_t arcstat_tempreserve;
 316         kstat_named_t arcstat_loaned_bytes;
 317         kstat_named_t arcstat_prune;
 318         kstat_named_t arcstat_meta_used;
 319         kstat_named_t arcstat_meta_limit;
 320         kstat_named_t arcstat_meta_max;
 321 } arc_stats_t;
 322
 323 static arc_stats_t arc_stats = {
 324         { "hits",                       KSTAT_DATA_UINT64 },
 325         { "misses",                     KSTAT_DATA_UINT64 },
 326         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 327         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 328         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 329         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 330         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 331         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 332         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 333         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 334         { "mru_hits",                   KSTAT_DATA_UINT64 },
 335         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 336         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 337         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 338         { "deleted",                    KSTAT_DATA_UINT64 },
 339         { "recycle_miss",               KSTAT_DATA_UINT64 },
 340         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 341         { "evict_skip",                 KSTAT_DATA_UINT64 },
 342         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 343         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 344         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 345         { "hash_elements",              KSTAT_DATA_UINT64 },
 346         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 347         { "hash_collisions",            KSTAT_DATA_UINT64 },
 348         { "hash_chains",                KSTAT_DATA_UINT64 },
 349         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 350         { "p",                          KSTAT_DATA_UINT64 },
 351         { "c",                          KSTAT_DATA_UINT64 },
 352         { "c_min",                      KSTAT_DATA_UINT64 },
 353         { "c_max",                      KSTAT_DATA_UINT64 },
 354         { "size",                       KSTAT_DATA_UINT64 },
 355         { "hdr_size",                   KSTAT_DATA_UINT64 },
 356         { "data_size",                  KSTAT_DATA_UINT64 },
 357         { "other_size",                 KSTAT_DATA_UINT64 },
 358         { "anon_size",                  KSTAT_DATA_UINT64 },
 359         { "anon_evict_data",            KSTAT_DATA_UINT64 },
 360         { "anon_evict_metadata",        KSTAT_DATA_UINT64 },
 361         { "mru_size",                   KSTAT_DATA_UINT64 },
 362         { "mru_evict_data",             KSTAT_DATA_UINT64 },
 363         { "mru_evict_metadata",         KSTAT_DATA_UINT64 },
 364         { "mru_ghost_size",             KSTAT_DATA_UINT64 },
 365         { "mru_ghost_evict_data",       KSTAT_DATA_UINT64 },
 366         { "mru_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
 367         { "mfu_size",                   KSTAT_DATA_UINT64 },
 368         { "mfu_evict_data",             KSTAT_DATA_UINT64 },
 369         { "mfu_evict_metadata",         KSTAT_DATA_UINT64 },
 370         { "mfu_ghost_size",             KSTAT_DATA_UINT64 },
 371         { "mfu_ghost_evict_data",       KSTAT_DATA_UINT64 },
 372         { "mfu_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
 373         { "l2_hits",                    KSTAT_DATA_UINT64 },
 374         { "l2_misses",                  KSTAT_DATA_UINT64 },
 375         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 376         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 377         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 378         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 379         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 380         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 381         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 382         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 383         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 384         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 385         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 386         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 387         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 388         { "l2_io_error",                KSTAT_DATA_UINT64 },
 389         { "l2_size",                    KSTAT_DATA_UINT64 },
 390         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 391         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 392         { "memory_direct_count",        KSTAT_DATA_UINT64 },
 393         { "memory_indirect_count",      KSTAT_DATA_UINT64 },
 394         { "arc_no_grow",                KSTAT_DATA_UINT64 },
 395         { "arc_tempreserve",            KSTAT_DATA_UINT64 },
 396         { "arc_loaned_bytes",           KSTAT_DATA_UINT64 },
 397         { "arc_prune",                  KSTAT_DATA_UINT64 },
 398         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 399         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 400         { "arc_meta_max",               KSTAT_DATA_UINT64 },
 401 };
 402
 403 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 404
 405 #define ARCSTAT_INCR(stat, val) \
 406         atomic_add_64(&arc_stats.stat.value.ui64, (val));
 407
 408 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 409 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 410
 411 #define ARCSTAT_MAX(stat, val) {                                        \
 412         uint64_t m;                                                     \
 413         while ((val) > (m = arc_stats.stat.value.ui64) &&               \
 414             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
 415                 continue;                                               \
 416 }
 417
 418 #define ARCSTAT_MAXSTAT(stat) \
 419         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 420
 421 /*
 422  * We define a macro to allow ARC hits/misses to be easily broken down by
 423  * two separate conditions, giving a total of four different subtypes for
 424  * each of hits and misses (so eight statistics total).
 425  */
 426 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 427         if (cond1) {                                                    \
 428                 if (cond2) {                                            \
 429                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 430                 } else {                                                \
 431                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 432                 }                                                       \
 433         } else {                                                        \
 434                 if (cond2) {                                            \
 435                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 436                 } else {                                                \
 437                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 438                 }                                                       \
 439         }
 440
 441 kstat_t                 *arc_ksp;
 442 static arc_state_t      *arc_anon;
 443 static arc_state_t      *arc_mru;
 444 static arc_state_t      *arc_mru_ghost;
 445 static arc_state_t      *arc_mfu;
 446 static arc_state_t      *arc_mfu_ghost;
 447 static arc_state_t      *arc_l2c_only;
 448
 449 /*
 450  * There are several ARC variables that are critical to export as kstats --
 451  * but we don't want to have to grovel around in the kstat whenever we wish to
 452  * manipulate them.  For these variables, we therefore define them to be in
 453  * terms of the statistic variable.  This assures that we are not introducing
 454  * the possibility of inconsistency by having shadow copies of the variables,
 455  * while still allowing the code to be readable.
 456  */
 457 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 458 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 459 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 460 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 461 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 462 #define arc_no_grow     ARCSTAT(arcstat_no_grow)
 463 #define arc_tempreserve ARCSTAT(arcstat_tempreserve)
 464 #define arc_loaned_bytes        ARCSTAT(arcstat_loaned_bytes)
 465 #define arc_meta_used   ARCSTAT(arcstat_meta_used)
 466 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit)
 467 #define arc_meta_max    ARCSTAT(arcstat_meta_max)
 468
 469 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 470
 471 typedef struct arc_callback arc_callback_t;
 472
 473 struct arc_callback {
 474         void                    *acb_private;
 475         arc_done_func_t         *acb_done;
 476         arc_buf_t               *acb_buf;
 477         zio_t                   *acb_zio_dummy;
 478         arc_callback_t          *acb_next;
 479 };
 480
 481 typedef struct arc_write_callback arc_write_callback_t;
 482
 483 struct arc_write_callback {
 484         void            *awcb_private;
 485         arc_done_func_t *awcb_ready;
 486         arc_done_func_t *awcb_done;
 487         arc_buf_t       *awcb_buf;
 488 };
 489
 490 struct arc_buf_hdr {
 491         /* protected by hash lock */
 492         dva_t                   b_dva;
 493         uint64_t                b_birth;
 494         uint64_t                b_cksum0;
 495
 496         kmutex_t                b_freeze_lock;
 497         zio_cksum_t             *b_freeze_cksum;
 498         void                    *b_thawed;
 499
 500         arc_buf_hdr_t           *b_hash_next;
 501         arc_buf_t               *b_buf;
 502         uint32_t                b_flags;
 503         uint32_t                b_datacnt;
 504
 505         arc_callback_t          *b_acb;
 506         kcondvar_t              b_cv;
 507
 508         /* immutable */
 509         arc_buf_contents_t      b_type;
 510         uint64_t                b_size;
 511         uint64_t                b_spa;
 512
 513         /* protected by arc state mutex */
 514         arc_state_t             *b_state;
 515         list_node_t             b_arc_node;
 516
 517         /* updated atomically */
 518         clock_t                 b_arc_access;
 519
 520         /* self protecting */
 521         refcount_t              b_refcnt;
 522
 523         l2arc_buf_hdr_t         *b_l2hdr;
 524         list_node_t             b_l2node;
 525 };
 526
 527 static list_t arc_prune_list;
 528 static kmutex_t arc_prune_mtx;
 529 static arc_buf_t *arc_eviction_list;
 530 static kmutex_t arc_eviction_mtx;
 531 static arc_buf_hdr_t arc_eviction_hdr;
 532 static void arc_get_data_buf(arc_buf_t *buf);
 533 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 534 static int arc_evict_needed(arc_buf_contents_t type);
 535 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 536
 537 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 538
 539 #define GHOST_STATE(state)      \
 540         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 541         (state) == arc_l2c_only)
 542
 543 /*
 544  * Private ARC flags.  These flags are private ARC only flags that will show up
 545  * in b_flags in the arc_hdr_buf_t.  Some flags are publicly declared, and can
 546  * be passed in as arc_flags in things like arc_read.  However, these flags
 547  * should never be passed and should only be set by ARC code.  When adding new
 548  * public flags, make sure not to smash the private ones.
 549  */
 550
 551 #define ARC_IN_HASH_TABLE       (1 << 9)        /* this buffer is hashed */
 552 #define ARC_IO_IN_PROGRESS      (1 << 10)       /* I/O in progress for buf */
 553 #define ARC_IO_ERROR            (1 << 11)       /* I/O failed for buf */
 554 #define ARC_FREED_IN_READ       (1 << 12)       /* buf freed while in read */
 555 #define ARC_BUF_AVAILABLE       (1 << 13)       /* block not in active use */
 556 #define ARC_INDIRECT            (1 << 14)       /* this is an indirect block */
 557 #define ARC_FREE_IN_PROGRESS    (1 << 15)       /* hdr about to be freed */
 558 #define ARC_L2_WRITING          (1 << 16)       /* L2ARC write in progress */
 559 #define ARC_L2_EVICTED          (1 << 17)       /* evicted during I/O */
 560 #define ARC_L2_WRITE_HEAD       (1 << 18)       /* head of write list */
 561
 562 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 563 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 564 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 565 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH)
 566 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
 567 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 568 #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 569 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
 570 #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
 571                                     (hdr)->b_l2hdr != NULL)
 572 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
 573 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
 574 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 575
 576 /*
 577  * Other sizes
 578  */
 579
 580 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 581 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 582
 583 /*
 584  * Hash table routines
 585  */
 586
 587 #define HT_LOCK_ALIGN   64
 588 #define HT_LOCK_PAD     (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
 589
 590 struct ht_lock {
 591         kmutex_t        ht_lock;
 592 #ifdef _KERNEL
 593         unsigned char   pad[HT_LOCK_PAD];
 594 #endif
 595 };
 596
 597 #define BUF_LOCKS 256
 598 typedef struct buf_hash_table {
 599         uint64_t ht_mask;
 600         arc_buf_hdr_t **ht_table;
 601         struct ht_lock ht_locks[BUF_LOCKS];
 602 } buf_hash_table_t;
 603
 604 static buf_hash_table_t buf_hash_table;
 605
 606 #define BUF_HASH_INDEX(spa, dva, birth) \
 607         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 608 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 609 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 610 #define HDR_LOCK(hdr) \
 611         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 612
 613 uint64_t zfs_crc64_table[256];
 614
 615 /*
 616  * Level 2 ARC
 617  */
 618
 619 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 620 #define L2ARC_HEADROOM          2               /* num of writes */
 621 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 622 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 623
 624 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 625 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 626
 627 /*
 628  * L2ARC Performance Tunables
 629  */
 630 unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;       /* def max write size */
 631 unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;     /* extra warmup write */
 632 unsigned long l2arc_headroom = L2ARC_HEADROOM;          /* # of dev writes */
 633 unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;        /* interval seconds */
 634 unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;    /* min interval msecs */
 635 int l2arc_noprefetch = B_TRUE;                  /* don't cache prefetch bufs */
 636 int l2arc_feed_again = B_TRUE;                  /* turbo warmup */
 637 int l2arc_norw = B_TRUE;                        /* no reads during writes */
 638
 639 /*
 640  * L2ARC Internals
 641  */
 642 typedef struct l2arc_dev {
 643         vdev_t                  *l2ad_vdev;     /* vdev */
 644         spa_t                   *l2ad_spa;      /* spa */
 645         uint64_t                l2ad_hand;      /* next write location */
 646         uint64_t                l2ad_write;     /* desired write size, bytes */
 647         uint64_t                l2ad_boost;     /* warmup write boost, bytes */
 648         uint64_t                l2ad_start;     /* first addr on device */
 649         uint64_t                l2ad_end;       /* last addr on device */
 650         uint64_t                l2ad_evict;     /* last addr eviction reached */
 651         boolean_t               l2ad_first;     /* first sweep through */
 652         boolean_t               l2ad_writing;   /* currently writing */
 653         list_t                  *l2ad_buflist;  /* buffer list */
 654         list_node_t             l2ad_node;      /* device list node */
 655 } l2arc_dev_t;
 656
 657 static list_t L2ARC_dev_list;                   /* device list */
 658 static list_t *l2arc_dev_list;                  /* device list pointer */
 659 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 660 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 661 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 662 static list_t L2ARC_free_on_write;              /* free after write buf list */
 663 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 664 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 665 static uint64_t l2arc_ndev;                     /* number of devices */
 666
 667 typedef struct l2arc_read_callback {
 668         arc_buf_t       *l2rcb_buf;             /* read buffer */
 669         spa_t           *l2rcb_spa;             /* spa */
 670         blkptr_t        l2rcb_bp;               /* original blkptr */
 671         zbookmark_t     l2rcb_zb;               /* original bookmark */
 672         int             l2rcb_flags;            /* original flags */
 673 } l2arc_read_callback_t;
 674
 675 typedef struct l2arc_write_callback {
 676         l2arc_dev_t     *l2wcb_dev;             /* device info */
 677         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 678 } l2arc_write_callback_t;
 679
 680 struct l2arc_buf_hdr {
 681         /* protected by arc_buf_hdr  mutex */
 682         l2arc_dev_t     *b_dev;                 /* L2ARC device */
 683         uint64_t        b_daddr;                /* disk address, offset byte */
 684 };
 685
 686 typedef struct l2arc_data_free {
 687         /* protected by l2arc_free_on_write_mtx */
 688         void            *l2df_data;
 689         size_t          l2df_size;
 690         void            (*l2df_func)(void *, size_t);
 691         list_node_t     l2df_list_node;
 692 } l2arc_data_free_t;
 693
 694 static kmutex_t l2arc_feed_thr_lock;
 695 static kcondvar_t l2arc_feed_thr_cv;
 696 static uint8_t l2arc_thread_exit;
 697
 698 static void l2arc_read_done(zio_t *zio);
 699 static void l2arc_hdr_stat_add(void);
 700 static void l2arc_hdr_stat_remove(void);
 701
 702 static uint64_t
 703 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 704 {
 705         uint8_t *vdva = (uint8_t *)dva;
 706         uint64_t crc = -1ULL;
 707         int i;
 708
 709         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 710
 711         for (i = 0; i < sizeof (dva_t); i++)
 712                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 713
 714         crc ^= (spa>>8) ^ birth;
 715
 716         return (crc);
 717 }
 718
 719 #define BUF_EMPTY(buf)                                          \
 720         ((buf)->b_dva.dva_word[0] == 0 &&                       \
 721         (buf)->b_dva.dva_word[1] == 0 &&                        \
 722         (buf)->b_birth == 0)
 723
 724 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 725         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
 726         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
 727         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 728
 729 static void
 730 buf_discard_identity(arc_buf_hdr_t *hdr)
 731 {
 732         hdr->b_dva.dva_word[0] = 0;
 733         hdr->b_dva.dva_word[1] = 0;
 734         hdr->b_birth = 0;
 735         hdr->b_cksum0 = 0;
 736 }
 737
 738 static arc_buf_hdr_t *
 739 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 740 {
 741         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 742         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 743         arc_buf_hdr_t *buf;
 744
 745         mutex_enter(hash_lock);
 746         for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
 747             buf = buf->b_hash_next) {
 748                 if (BUF_EQUAL(spa, dva, birth, buf)) {
 749                         *lockp = hash_lock;
 750                         return (buf);
 751                 }
 752         }
 753         mutex_exit(hash_lock);
 754         *lockp = NULL;
 755         return (NULL);
 756 }
 757
 758 /*
 759  * Insert an entry into the hash table.  If there is already an element
 760  * equal to elem in the hash table, then the already existing element
 761  * will be returned and the new element will not be inserted.
 762  * Otherwise returns NULL.
 763  */
 764 static arc_buf_hdr_t *
 765 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 766 {
 767         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 768         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 769         arc_buf_hdr_t *fbuf;
 770         uint32_t i;
 771
 772         ASSERT(!HDR_IN_HASH_TABLE(buf));
 773         *lockp = hash_lock;
 774         mutex_enter(hash_lock);
 775         for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
 776             fbuf = fbuf->b_hash_next, i++) {
 777                 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 778                         return (fbuf);
 779         }
 780
 781         buf->b_hash_next = buf_hash_table.ht_table[idx];
 782         buf_hash_table.ht_table[idx] = buf;
 783         buf->b_flags |= ARC_IN_HASH_TABLE;
 784
 785         /* collect some hash table performance data */
 786         if (i > 0) {
 787                 ARCSTAT_BUMP(arcstat_hash_collisions);
 788                 if (i == 1)
 789                         ARCSTAT_BUMP(arcstat_hash_chains);
 790
 791                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 792         }
 793
 794         ARCSTAT_BUMP(arcstat_hash_elements);
 795         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 796
 797         return (NULL);
 798 }
 799
 800 static void
 801 buf_hash_remove(arc_buf_hdr_t *buf)
 802 {
 803         arc_buf_hdr_t *fbuf, **bufp;
 804         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 805
 806         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 807         ASSERT(HDR_IN_HASH_TABLE(buf));
 808
 809         bufp = &buf_hash_table.ht_table[idx];
 810         while ((fbuf = *bufp) != buf) {
 811                 ASSERT(fbuf != NULL);
 812                 bufp = &fbuf->b_hash_next;
 813         }
 814         *bufp = buf->b_hash_next;
 815         buf->b_hash_next = NULL;
 816         buf->b_flags &= ~ARC_IN_HASH_TABLE;
 817
 818         /* collect some hash table performance data */
 819         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 820
 821         if (buf_hash_table.ht_table[idx] &&
 822             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 823                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 824 }
 825
 826 /*
 827  * Global data structures and functions for the buf kmem cache.
 828  */
 829 static kmem_cache_t *hdr_cache;
 830 static kmem_cache_t *buf_cache;
 831
 832 static void
 833 buf_fini(void)
 834 {
 835         int i;
 836
 837 #if defined(_KERNEL) && defined(HAVE_SPL)
 838         /* Large allocations which do not require contiguous pages
 839          * should be using vmem_free() in the linux kernel */
 840         vmem_free(buf_hash_table.ht_table,
 841             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 842 #else
 843         kmem_free(buf_hash_table.ht_table,
 844             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 845 #endif
 846         for (i = 0; i < BUF_LOCKS; i++)
 847                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 848         kmem_cache_destroy(hdr_cache);
 849         kmem_cache_destroy(buf_cache);
 850 }
 851
 852 /*
 853  * Constructor callback - called when the cache is empty
 854  * and a new buf is requested.
 855  */
 856 /* ARGSUSED */
 857 static int
 858 hdr_cons(void *vbuf, void *unused, int kmflag)
 859 {
 860         arc_buf_hdr_t *buf = vbuf;
 861
 862         bzero(buf, sizeof (arc_buf_hdr_t));
 863         refcount_create(&buf->b_refcnt);
 864         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 865         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 866         list_link_init(&buf->b_arc_node);
 867         list_link_init(&buf->b_l2node);
 868         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 869
 870         return (0);
 871 }
 872
 873 /* ARGSUSED */
 874 static int
 875 buf_cons(void *vbuf, void *unused, int kmflag)
 876 {
 877         arc_buf_t *buf = vbuf;
 878
 879         bzero(buf, sizeof (arc_buf_t));
 880         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 881         rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
 882         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 883
 884         return (0);
 885 }
 886
 887 /*
 888  * Destructor callback - called when a cached buf is
 889  * no longer required.
 890  */
 891 /* ARGSUSED */
 892 static void
 893 hdr_dest(void *vbuf, void *unused)
 894 {
 895         arc_buf_hdr_t *buf = vbuf;
 896
 897         ASSERT(BUF_EMPTY(buf));
 898         refcount_destroy(&buf->b_refcnt);
 899         cv_destroy(&buf->b_cv);
 900         mutex_destroy(&buf->b_freeze_lock);
 901         arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 902 }
 903
 904 /* ARGSUSED */
 905 static void
 906 buf_dest(void *vbuf, void *unused)
 907 {
 908         arc_buf_t *buf = vbuf;
 909
 910         mutex_destroy(&buf->b_evict_lock);
 911         rw_destroy(&buf->b_data_lock);
 912         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 913 }
 914
 915 static void
 916 buf_init(void)
 917 {
 918         uint64_t *ct;
 919         uint64_t hsize = 1ULL << 12;
 920         int i, j;
 921
 922         /*
 923          * The hash table is big enough to fill all of physical memory
 924          * with an average 64K block size.  The table will take up
 925          * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
 926          */
 927         while (hsize * 65536 < physmem * PAGESIZE)
 928                 hsize <<= 1;
 929 retry:
 930         buf_hash_table.ht_mask = hsize - 1;
 931 #if defined(_KERNEL) && defined(HAVE_SPL)
 932         /* Large allocations which do not require contiguous pages
 933          * should be using vmem_alloc() in the linux kernel */
 934         buf_hash_table.ht_table =
 935             vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
 936 #else
 937         buf_hash_table.ht_table =
 938             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 939 #endif
 940         if (buf_hash_table.ht_table == NULL) {
 941                 ASSERT(hsize > (1ULL << 8));
 942                 hsize >>= 1;
 943                 goto retry;
 944         }
 945
 946         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 947             0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0);
 948         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 949             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 950
 951         for (i = 0; i < 256; i++)
 952                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 953                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 954
 955         for (i = 0; i < BUF_LOCKS; i++) {
 956                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 957                     NULL, MUTEX_DEFAULT, NULL);
 958         }
 959 }
 960
 961 #define ARC_MINTIME     (hz>>4) /* 62 ms */
 962
 963 static void
 964 arc_cksum_verify(arc_buf_t *buf)
 965 {
 966         zio_cksum_t zc;
 967
 968         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 969                 return;
 970
 971         mutex_enter(&buf->b_hdr->b_freeze_lock);
 972         if (buf->b_hdr->b_freeze_cksum == NULL ||
 973             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
 974                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 975                 return;
 976         }
 977         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 978         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
 979                 panic("buffer modified while frozen!");
 980         mutex_exit(&buf->b_hdr->b_freeze_lock);
 981 }
 982
 983 static int
 984 arc_cksum_equal(arc_buf_t *buf)
 985 {
 986         zio_cksum_t zc;
 987         int equal;
 988
 989         mutex_enter(&buf->b_hdr->b_freeze_lock);
 990         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 991         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
 992         mutex_exit(&buf->b_hdr->b_freeze_lock);
 993
 994         return (equal);
 995 }
 996
 997 static void
 998 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 999 {
1000         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1001                 return;
1002
1003         mutex_enter(&buf->b_hdr->b_freeze_lock);
1004         if (buf->b_hdr->b_freeze_cksum != NULL) {
1005                 mutex_exit(&buf->b_hdr->b_freeze_lock);
1006                 return;
1007         }
1008         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1009                                                 KM_PUSHPAGE);
1010         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1011             buf->b_hdr->b_freeze_cksum);
1012         mutex_exit(&buf->b_hdr->b_freeze_lock);
1013 }
1014
1015 void
1016 arc_buf_thaw(arc_buf_t *buf)
1017 {
1018         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1019                 if (buf->b_hdr->b_state != arc_anon)
1020                         panic("modifying non-anon buffer!");
1021                 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1022                         panic("modifying buffer while i/o in progress!");
1023                 arc_cksum_verify(buf);
1024         }
1025
1026         mutex_enter(&buf->b_hdr->b_freeze_lock);
1027         if (buf->b_hdr->b_freeze_cksum != NULL) {
1028                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1029                 buf->b_hdr->b_freeze_cksum = NULL;
1030         }
1031
1032         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1033                 if (buf->b_hdr->b_thawed)
1034                         kmem_free(buf->b_hdr->b_thawed, 1);
1035                 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1036         }
1037
1038         mutex_exit(&buf->b_hdr->b_freeze_lock);
1039 }
1040
1041 void
1042 arc_buf_freeze(arc_buf_t *buf)
1043 {
1044         kmutex_t *hash_lock;
1045
1046         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1047                 return;
1048
1049         hash_lock = HDR_LOCK(buf->b_hdr);
1050         mutex_enter(hash_lock);
1051
1052         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1053             buf->b_hdr->b_state == arc_anon);
1054         arc_cksum_compute(buf, B_FALSE);
1055         mutex_exit(hash_lock);
1056 }
1057
1058 static void
1059 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1060 {
1061         ASSERT(MUTEX_HELD(hash_lock));
1062
1063         if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1064             (ab->b_state != arc_anon)) {
1065                 uint64_t delta = ab->b_size * ab->b_datacnt;
1066                 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1067                 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1068
1069                 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1070                 mutex_enter(&ab->b_state->arcs_mtx);
1071                 ASSERT(list_link_active(&ab->b_arc_node));
1072                 list_remove(list, ab);
1073                 if (GHOST_STATE(ab->b_state)) {
1074                         ASSERT3U(ab->b_datacnt, ==, 0);
1075                         ASSERT3P(ab->b_buf, ==, NULL);
1076                         delta = ab->b_size;
1077                 }
1078                 ASSERT(delta > 0);
1079                 ASSERT3U(*size, >=, delta);
1080                 atomic_add_64(size, -delta);
1081                 mutex_exit(&ab->b_state->arcs_mtx);
1082                 /* remove the prefetch flag if we get a reference */
1083                 if (ab->b_flags & ARC_PREFETCH)
1084                         ab->b_flags &= ~ARC_PREFETCH;
1085         }
1086 }
1087
1088 static int
1089 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1090 {
1091         int cnt;
1092         arc_state_t *state = ab->b_state;
1093
1094         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1095         ASSERT(!GHOST_STATE(state));
1096
1097         if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1098             (state != arc_anon)) {
1099                 uint64_t *size = &state->arcs_lsize[ab->b_type];
1100
1101                 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1102                 mutex_enter(&state->arcs_mtx);
1103                 ASSERT(!list_link_active(&ab->b_arc_node));
1104                 list_insert_head(&state->arcs_list[ab->b_type], ab);
1105                 ASSERT(ab->b_datacnt > 0);
1106                 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1107                 mutex_exit(&state->arcs_mtx);
1108         }
1109         return (cnt);
1110 }
1111
1112 /*
1113  * Move the supplied buffer to the indicated state.  The mutex
1114  * for the buffer must be held by the caller.
1115  */
1116 static void
1117 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1118 {
1119         arc_state_t *old_state = ab->b_state;
1120         int64_t refcnt = refcount_count(&ab->b_refcnt);
1121         uint64_t from_delta, to_delta;
1122
1123         ASSERT(MUTEX_HELD(hash_lock));
1124         ASSERT(new_state != old_state);
1125         ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1126         ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1127         ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1128
1129         from_delta = to_delta = ab->b_datacnt * ab->b_size;
1130
1131         /*
1132          * If this buffer is evictable, transfer it from the
1133          * old state list to the new state list.
1134          */
1135         if (refcnt == 0) {
1136                 if (old_state != arc_anon) {
1137                         int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1138                         uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1139
1140                         if (use_mutex)
1141                                 mutex_enter(&old_state->arcs_mtx);
1142
1143                         ASSERT(list_link_active(&ab->b_arc_node));
1144                         list_remove(&old_state->arcs_list[ab->b_type], ab);
1145
1146                         /*
1147                          * If prefetching out of the ghost cache,
1148                          * we will have a non-zero datacnt.
1149                          */
1150                         if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1151                                 /* ghost elements have a ghost size */
1152                                 ASSERT(ab->b_buf == NULL);
1153                                 from_delta = ab->b_size;
1154                         }
1155                         ASSERT3U(*size, >=, from_delta);
1156                         atomic_add_64(size, -from_delta);
1157
1158                         if (use_mutex)
1159                                 mutex_exit(&old_state->arcs_mtx);
1160                 }
1161                 if (new_state != arc_anon) {
1162                         int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1163                         uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1164
1165                         if (use_mutex)
1166                                 mutex_enter(&new_state->arcs_mtx);
1167
1168                         list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1169
1170                         /* ghost elements have a ghost size */
1171                         if (GHOST_STATE(new_state)) {
1172                                 ASSERT(ab->b_datacnt == 0);
1173                                 ASSERT(ab->b_buf == NULL);
1174                                 to_delta = ab->b_size;
1175                         }
1176                         atomic_add_64(size, to_delta);
1177
1178                         if (use_mutex)
1179                                 mutex_exit(&new_state->arcs_mtx);
1180                 }
1181         }
1182
1183         ASSERT(!BUF_EMPTY(ab));
1184         if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1185                 buf_hash_remove(ab);
1186
1187         /* adjust state sizes */
1188         if (to_delta)
1189                 atomic_add_64(&new_state->arcs_size, to_delta);
1190         if (from_delta) {
1191                 ASSERT3U(old_state->arcs_size, >=, from_delta);
1192                 atomic_add_64(&old_state->arcs_size, -from_delta);
1193         }
1194         ab->b_state = new_state;
1195
1196         /* adjust l2arc hdr stats */
1197         if (new_state == arc_l2c_only)
1198                 l2arc_hdr_stat_add();
1199         else if (old_state == arc_l2c_only)
1200                 l2arc_hdr_stat_remove();
1201 }
1202
1203 void
1204 arc_space_consume(uint64_t space, arc_space_type_t type)
1205 {
1206         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1207
1208         switch (type) {
1209         default:
1210                 break;
1211         case ARC_SPACE_DATA:
1212                 ARCSTAT_INCR(arcstat_data_size, space);
1213                 break;
1214         case ARC_SPACE_OTHER:
1215                 ARCSTAT_INCR(arcstat_other_size, space);
1216                 break;
1217         case ARC_SPACE_HDRS:
1218                 ARCSTAT_INCR(arcstat_hdr_size, space);
1219                 break;
1220         case ARC_SPACE_L2HDRS:
1221                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1222                 break;
1223         }
1224
1225         atomic_add_64(&arc_meta_used, space);
1226         atomic_add_64(&arc_size, space);
1227 }
1228
1229 void
1230 arc_space_return(uint64_t space, arc_space_type_t type)
1231 {
1232         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1233
1234         switch (type) {
1235         default:
1236                 break;
1237         case ARC_SPACE_DATA:
1238                 ARCSTAT_INCR(arcstat_data_size, -space);
1239                 break;
1240         case ARC_SPACE_OTHER:
1241                 ARCSTAT_INCR(arcstat_other_size, -space);
1242                 break;
1243         case ARC_SPACE_HDRS:
1244                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1245                 break;
1246         case ARC_SPACE_L2HDRS:
1247                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1248                 break;
1249         }
1250
1251         ASSERT(arc_meta_used >= space);
1252         if (arc_meta_max < arc_meta_used)
1253                 arc_meta_max = arc_meta_used;
1254         atomic_add_64(&arc_meta_used, -space);
1255         ASSERT(arc_size >= space);
1256         atomic_add_64(&arc_size, -space);
1257 }
1258
1259 void *
1260 arc_data_buf_alloc(uint64_t size)
1261 {
1262         if (arc_evict_needed(ARC_BUFC_DATA))
1263                 cv_signal(&arc_reclaim_thr_cv);
1264         atomic_add_64(&arc_size, size);
1265         return (zio_data_buf_alloc(size));
1266 }
1267
1268 void
1269 arc_data_buf_free(void *buf, uint64_t size)
1270 {
1271         zio_data_buf_free(buf, size);
1272         ASSERT(arc_size >= size);
1273         atomic_add_64(&arc_size, -size);
1274 }
1275
1276 arc_buf_t *
1277 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1278 {
1279         arc_buf_hdr_t *hdr;
1280         arc_buf_t *buf;
1281
1282         ASSERT3U(size, >, 0);
1283         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1284         ASSERT(BUF_EMPTY(hdr));
1285         hdr->b_size = size;
1286         hdr->b_type = type;
1287         hdr->b_spa = spa_guid(spa);
1288         hdr->b_state = arc_anon;
1289         hdr->b_arc_access = 0;
1290         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1291         buf->b_hdr = hdr;
1292         buf->b_data = NULL;
1293         buf->b_efunc = NULL;
1294         buf->b_private = NULL;
1295         buf->b_next = NULL;
1296         hdr->b_buf = buf;
1297         arc_get_data_buf(buf);
1298         hdr->b_datacnt = 1;
1299         hdr->b_flags = 0;
1300         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1301         (void) refcount_add(&hdr->b_refcnt, tag);
1302
1303         return (buf);
1304 }
1305
1306 static char *arc_onloan_tag = "onloan";
1307
1308 /*
1309  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1310  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1311  * buffers must be returned to the arc before they can be used by the DMU or
1312  * freed.
1313  */
1314 arc_buf_t *
1315 arc_loan_buf(spa_t *spa, int size)
1316 {
1317         arc_buf_t *buf;
1318
1319         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1320
1321         atomic_add_64(&arc_loaned_bytes, size);
1322         return (buf);
1323 }
1324
1325 /*
1326  * Return a loaned arc buffer to the arc.
1327  */
1328 void
1329 arc_return_buf(arc_buf_t *buf, void *tag)
1330 {
1331         arc_buf_hdr_t *hdr = buf->b_hdr;
1332
1333         ASSERT(buf->b_data != NULL);
1334         (void) refcount_add(&hdr->b_refcnt, tag);
1335         (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1336
1337         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1338 }
1339
1340 /* Detach an arc_buf from a dbuf (tag) */
1341 void
1342 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1343 {
1344         arc_buf_hdr_t *hdr;
1345
1346         ASSERT(buf->b_data != NULL);
1347         hdr = buf->b_hdr;
1348         (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1349         (void) refcount_remove(&hdr->b_refcnt, tag);
1350         buf->b_efunc = NULL;
1351         buf->b_private = NULL;
1352
1353         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1354 }
1355
1356 static arc_buf_t *
1357 arc_buf_clone(arc_buf_t *from)
1358 {
1359         arc_buf_t *buf;
1360         arc_buf_hdr_t *hdr = from->b_hdr;
1361         uint64_t size = hdr->b_size;
1362
1363         ASSERT(hdr->b_state != arc_anon);
1364
1365         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1366         buf->b_hdr = hdr;
1367         buf->b_data = NULL;
1368         buf->b_efunc = NULL;
1369         buf->b_private = NULL;
1370         buf->b_next = hdr->b_buf;
1371         hdr->b_buf = buf;
1372         arc_get_data_buf(buf);
1373         bcopy(from->b_data, buf->b_data, size);
1374         hdr->b_datacnt += 1;
1375         return (buf);
1376 }
1377
1378 void
1379 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1380 {
1381         arc_buf_hdr_t *hdr;
1382         kmutex_t *hash_lock;
1383
1384         /*
1385          * Check to see if this buffer is evicted.  Callers
1386          * must verify b_data != NULL to know if the add_ref
1387          * was successful.
1388          */
1389         mutex_enter(&buf->b_evict_lock);
1390         if (buf->b_data == NULL) {
1391                 mutex_exit(&buf->b_evict_lock);
1392                 return;
1393         }
1394         hash_lock = HDR_LOCK(buf->b_hdr);
1395         mutex_enter(hash_lock);
1396         hdr = buf->b_hdr;
1397         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1398         mutex_exit(&buf->b_evict_lock);
1399
1400         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1401         add_reference(hdr, hash_lock, tag);
1402         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1403         arc_access(hdr, hash_lock);
1404         mutex_exit(hash_lock);
1405         ARCSTAT_BUMP(arcstat_hits);
1406         ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1407             demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1408             data, metadata, hits);
1409 }
1410
1411 /*
1412  * Free the arc data buffer.  If it is an l2arc write in progress,
1413  * the buffer is placed on l2arc_free_on_write to be freed later.
1414  */
1415 static void
1416 arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
1417     void *data, size_t size)
1418 {
1419         if (HDR_L2_WRITING(hdr)) {
1420                 l2arc_data_free_t *df;
1421                 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1422                 df->l2df_data = data;
1423                 df->l2df_size = size;
1424                 df->l2df_func = free_func;
1425                 mutex_enter(&l2arc_free_on_write_mtx);
1426                 list_insert_head(l2arc_free_on_write, df);
1427                 mutex_exit(&l2arc_free_on_write_mtx);
1428                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1429         } else {
1430                 free_func(data, size);
1431         }
1432 }
1433
1434 static void
1435 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1436 {
1437         arc_buf_t **bufp;
1438
1439         /* free up data associated with the buf */
1440         if (buf->b_data) {
1441                 arc_state_t *state = buf->b_hdr->b_state;
1442                 uint64_t size = buf->b_hdr->b_size;
1443                 arc_buf_contents_t type = buf->b_hdr->b_type;
1444
1445                 arc_cksum_verify(buf);
1446
1447                 if (!recycle) {
1448                         if (type == ARC_BUFC_METADATA) {
1449                                 arc_buf_data_free(buf->b_hdr, zio_buf_free,
1450                                     buf->b_data, size);
1451                                 arc_space_return(size, ARC_SPACE_DATA);
1452                         } else {
1453                                 ASSERT(type == ARC_BUFC_DATA);
1454                                 arc_buf_data_free(buf->b_hdr,
1455                                     zio_data_buf_free, buf->b_data, size);
1456                                 ARCSTAT_INCR(arcstat_data_size, -size);
1457                                 atomic_add_64(&arc_size, -size);
1458                         }
1459                 }
1460                 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1461                         uint64_t *cnt = &state->arcs_lsize[type];
1462
1463                         ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1464                         ASSERT(state != arc_anon);
1465
1466                         ASSERT3U(*cnt, >=, size);
1467                         atomic_add_64(cnt, -size);
1468                 }
1469                 ASSERT3U(state->arcs_size, >=, size);
1470                 atomic_add_64(&state->arcs_size, -size);
1471                 buf->b_data = NULL;
1472                 ASSERT(buf->b_hdr->b_datacnt > 0);
1473                 buf->b_hdr->b_datacnt -= 1;
1474         }
1475
1476         /* only remove the buf if requested */
1477         if (!all)
1478                 return;
1479
1480         /* remove the buf from the hdr list */
1481         for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1482                 continue;
1483         *bufp = buf->b_next;
1484         buf->b_next = NULL;
1485
1486         ASSERT(buf->b_efunc == NULL);
1487
1488         /* clean up the buf */
1489         buf->b_hdr = NULL;
1490         kmem_cache_free(buf_cache, buf);
1491 }
1492
1493 static void
1494 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1495 {
1496         l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1497
1498         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1499         ASSERT3P(hdr->b_state, ==, arc_anon);
1500         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1501
1502         if (l2hdr != NULL) {
1503                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1504                 /*
1505                  * To prevent arc_free() and l2arc_evict() from
1506                  * attempting to free the same buffer at the same time,
1507                  * a FREE_IN_PROGRESS flag is given to arc_free() to
1508                  * give it priority.  l2arc_evict() can't destroy this
1509                  * header while we are waiting on l2arc_buflist_mtx.
1510                  *
1511                  * The hdr may be removed from l2ad_buflist before we
1512                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1513                  */
1514                 if (!buflist_held) {
1515                         mutex_enter(&l2arc_buflist_mtx);
1516                         l2hdr = hdr->b_l2hdr;
1517                 }
1518
1519                 if (l2hdr != NULL) {
1520                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1521                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1522                         kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1523                         if (hdr->b_state == arc_l2c_only)
1524                                 l2arc_hdr_stat_remove();
1525                         hdr->b_l2hdr = NULL;
1526                 }
1527
1528                 if (!buflist_held)
1529                         mutex_exit(&l2arc_buflist_mtx);
1530         }
1531
1532         if (!BUF_EMPTY(hdr)) {
1533                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1534                 buf_discard_identity(hdr);
1535         }
1536         while (hdr->b_buf) {
1537                 arc_buf_t *buf = hdr->b_buf;
1538
1539                 if (buf->b_efunc) {
1540                         mutex_enter(&arc_eviction_mtx);
1541                         mutex_enter(&buf->b_evict_lock);
1542                         ASSERT(buf->b_hdr != NULL);
1543                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1544                         hdr->b_buf = buf->b_next;
1545                         buf->b_hdr = &arc_eviction_hdr;
1546                         buf->b_next = arc_eviction_list;
1547                         arc_eviction_list = buf;
1548                         mutex_exit(&buf->b_evict_lock);
1549                         mutex_exit(&arc_eviction_mtx);
1550                 } else {
1551                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1552                 }
1553         }
1554         if (hdr->b_freeze_cksum != NULL) {
1555                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1556                 hdr->b_freeze_cksum = NULL;
1557         }
1558         if (hdr->b_thawed) {
1559                 kmem_free(hdr->b_thawed, 1);
1560                 hdr->b_thawed = NULL;
1561         }
1562
1563         ASSERT(!list_link_active(&hdr->b_arc_node));
1564         ASSERT3P(hdr->b_hash_next, ==, NULL);
1565         ASSERT3P(hdr->b_acb, ==, NULL);
1566         kmem_cache_free(hdr_cache, hdr);
1567 }
1568
1569 void
1570 arc_buf_free(arc_buf_t *buf, void *tag)
1571 {
1572         arc_buf_hdr_t *hdr = buf->b_hdr;
1573         int hashed = hdr->b_state != arc_anon;
1574
1575         ASSERT(buf->b_efunc == NULL);
1576         ASSERT(buf->b_data != NULL);
1577
1578         if (hashed) {
1579                 kmutex_t *hash_lock = HDR_LOCK(hdr);
1580
1581                 mutex_enter(hash_lock);
1582                 hdr = buf->b_hdr;
1583                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1584
1585                 (void) remove_reference(hdr, hash_lock, tag);
1586                 if (hdr->b_datacnt > 1) {
1587                         arc_buf_destroy(buf, FALSE, TRUE);
1588                 } else {
1589                         ASSERT(buf == hdr->b_buf);
1590                         ASSERT(buf->b_efunc == NULL);
1591                         hdr->b_flags |= ARC_BUF_AVAILABLE;
1592                 }
1593                 mutex_exit(hash_lock);
1594         } else if (HDR_IO_IN_PROGRESS(hdr)) {
1595                 int destroy_hdr;
1596                 /*
1597                  * We are in the middle of an async write.  Don't destroy
1598                  * this buffer unless the write completes before we finish
1599                  * decrementing the reference count.
1600                  */
1601                 mutex_enter(&arc_eviction_mtx);
1602                 (void) remove_reference(hdr, NULL, tag);
1603                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1604                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1605                 mutex_exit(&arc_eviction_mtx);
1606                 if (destroy_hdr)
1607                         arc_hdr_destroy(hdr);
1608         } else {
1609                 if (remove_reference(hdr, NULL, tag) > 0)
1610                         arc_buf_destroy(buf, FALSE, TRUE);
1611                 else
1612                         arc_hdr_destroy(hdr);
1613         }
1614 }
1615
1616 int
1617 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1618 {
1619         arc_buf_hdr_t *hdr = buf->b_hdr;
1620         kmutex_t *hash_lock = HDR_LOCK(hdr);
1621         int no_callback = (buf->b_efunc == NULL);
1622
1623         if (hdr->b_state == arc_anon) {
1624                 ASSERT(hdr->b_datacnt == 1);
1625                 arc_buf_free(buf, tag);
1626                 return (no_callback);
1627         }
1628
1629         mutex_enter(hash_lock);
1630         hdr = buf->b_hdr;
1631         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1632         ASSERT(hdr->b_state != arc_anon);
1633         ASSERT(buf->b_data != NULL);
1634
1635         (void) remove_reference(hdr, hash_lock, tag);
1636         if (hdr->b_datacnt > 1) {
1637                 if (no_callback)
1638                         arc_buf_destroy(buf, FALSE, TRUE);
1639         } else if (no_callback) {
1640                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1641                 ASSERT(buf->b_efunc == NULL);
1642                 hdr->b_flags |= ARC_BUF_AVAILABLE;
1643         }
1644         ASSERT(no_callback || hdr->b_datacnt > 1 ||
1645             refcount_is_zero(&hdr->b_refcnt));
1646         mutex_exit(hash_lock);
1647         return (no_callback);
1648 }
1649
1650 int
1651 arc_buf_size(arc_buf_t *buf)
1652 {
1653         return (buf->b_hdr->b_size);
1654 }
1655
1656 /*
1657  * Evict buffers from list until we've removed the specified number of
1658  * bytes.  Move the removed buffers to the appropriate evict state.
1659  * If the recycle flag is set, then attempt to "recycle" a buffer:
1660  * - look for a buffer to evict that is `bytes' long.
1661  * - return the data block from this buffer rather than freeing it.
1662  * This flag is used by callers that are trying to make space for a
1663  * new buffer in a full arc cache.
1664  *
1665  * This function makes a "best effort".  It skips over any buffers
1666  * it can't get a hash_lock on, and so may not catch all candidates.
1667  * It may also return without evicting as much space as requested.
1668  */
1669 static void *
1670 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1671     arc_buf_contents_t type)
1672 {
1673         arc_state_t *evicted_state;
1674         uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1675         arc_buf_hdr_t *ab, *ab_prev = NULL;
1676         list_t *list = &state->arcs_list[type];
1677         kmutex_t *hash_lock;
1678         boolean_t have_lock;
1679         void *stolen = NULL;
1680
1681         ASSERT(state == arc_mru || state == arc_mfu);
1682
1683         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1684
1685         mutex_enter(&state->arcs_mtx);
1686         mutex_enter(&evicted_state->arcs_mtx);
1687
1688         for (ab = list_tail(list); ab; ab = ab_prev) {
1689                 ab_prev = list_prev(list, ab);
1690                 /* prefetch buffers have a minimum lifespan */
1691                 if (HDR_IO_IN_PROGRESS(ab) ||
1692                     (spa && ab->b_spa != spa) ||
1693                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1694                     ddi_get_lbolt() - ab->b_arc_access <
1695                     arc_min_prefetch_lifespan)) {
1696                         skipped++;
1697                         continue;
1698                 }
1699                 /* "lookahead" for better eviction candidate */
1700                 if (recycle && ab->b_size != bytes &&
1701                     ab_prev && ab_prev->b_size == bytes)
1702                         continue;
1703                 hash_lock = HDR_LOCK(ab);
1704                 have_lock = MUTEX_HELD(hash_lock);
1705                 if (have_lock || mutex_tryenter(hash_lock)) {
1706                         ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
1707                         ASSERT(ab->b_datacnt > 0);
1708                         while (ab->b_buf) {
1709                                 arc_buf_t *buf = ab->b_buf;
1710                                 if (!mutex_tryenter(&buf->b_evict_lock)) {
1711                                         missed += 1;
1712                                         break;
1713                                 }
1714                                 if (buf->b_data) {
1715                                         bytes_evicted += ab->b_size;
1716                                         if (recycle && ab->b_type == type &&
1717                                             ab->b_size == bytes &&
1718                                             !HDR_L2_WRITING(ab)) {
1719                                                 stolen = buf->b_data;
1720                                                 recycle = FALSE;
1721                                         }
1722                                 }
1723                                 if (buf->b_efunc) {
1724                                         mutex_enter(&arc_eviction_mtx);
1725                                         arc_buf_destroy(buf,
1726                                             buf->b_data == stolen, FALSE);
1727                                         ab->b_buf = buf->b_next;
1728                                         buf->b_hdr = &arc_eviction_hdr;
1729                                         buf->b_next = arc_eviction_list;
1730                                         arc_eviction_list = buf;
1731                                         mutex_exit(&arc_eviction_mtx);
1732                                         mutex_exit(&buf->b_evict_lock);
1733                                 } else {
1734                                         mutex_exit(&buf->b_evict_lock);
1735                                         arc_buf_destroy(buf,
1736                                             buf->b_data == stolen, TRUE);
1737                                 }
1738                         }
1739
1740                         if (ab->b_l2hdr) {
1741                                 ARCSTAT_INCR(arcstat_evict_l2_cached,
1742                                     ab->b_size);
1743                         } else {
1744                                 if (l2arc_write_eligible(ab->b_spa, ab)) {
1745                                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
1746                                             ab->b_size);
1747                                 } else {
1748                                         ARCSTAT_INCR(
1749                                             arcstat_evict_l2_ineligible,
1750                                             ab->b_size);
1751                                 }
1752                         }
1753
1754                         if (ab->b_datacnt == 0) {
1755                                 arc_change_state(evicted_state, ab, hash_lock);
1756                                 ASSERT(HDR_IN_HASH_TABLE(ab));
1757                                 ab->b_flags |= ARC_IN_HASH_TABLE;
1758                                 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1759                                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1760                         }
1761                         if (!have_lock)
1762                                 mutex_exit(hash_lock);
1763                         if (bytes >= 0 && bytes_evicted >= bytes)
1764                                 break;
1765                 } else {
1766                         missed += 1;
1767                 }
1768         }
1769
1770         mutex_exit(&evicted_state->arcs_mtx);
1771         mutex_exit(&state->arcs_mtx);
1772
1773         if (bytes_evicted < bytes)
1774                 dprintf("only evicted %lld bytes from %x\n",
1775                     (longlong_t)bytes_evicted, state);
1776
1777         if (skipped)
1778                 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1779
1780         if (missed)
1781                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1782
1783         /*
1784          * We have just evicted some date into the ghost state, make
1785          * sure we also adjust the ghost state size if necessary.
1786          */
1787         if (arc_no_grow &&
1788             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1789                 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1790                     arc_mru_ghost->arcs_size - arc_c;
1791
1792                 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1793                         int64_t todelete =
1794                             MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1795                         arc_evict_ghost(arc_mru_ghost, 0, todelete);
1796                 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1797                         int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1798                             arc_mru_ghost->arcs_size +
1799                             arc_mfu_ghost->arcs_size - arc_c);
1800                         arc_evict_ghost(arc_mfu_ghost, 0, todelete);
1801                 }
1802         }
1803
1804         return (stolen);
1805 }
1806
1807 /*
1808  * Remove buffers from list until we've removed the specified number of
1809  * bytes.  Destroy the buffers that are removed.
1810  */
1811 static void
1812 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1813 {
1814         arc_buf_hdr_t *ab, *ab_prev;
1815         arc_buf_hdr_t marker;
1816         list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1817         kmutex_t *hash_lock;
1818         uint64_t bytes_deleted = 0;
1819         uint64_t bufs_skipped = 0;
1820
1821         ASSERT(GHOST_STATE(state));
1822         bzero(&marker, sizeof(marker));
1823 top:
1824         mutex_enter(&state->arcs_mtx);
1825         for (ab = list_tail(list); ab; ab = ab_prev) {
1826                 ab_prev = list_prev(list, ab);
1827                 if (spa && ab->b_spa != spa)
1828                         continue;
1829
1830                 /* ignore markers */
1831                 if (ab->b_spa == 0)
1832                         continue;
1833
1834                 hash_lock = HDR_LOCK(ab);
1835                 /* caller may be trying to modify this buffer, skip it */
1836                 if (MUTEX_HELD(hash_lock))
1837                         continue;
1838                 if (mutex_tryenter(hash_lock)) {
1839                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
1840                         ASSERT(ab->b_buf == NULL);
1841                         ARCSTAT_BUMP(arcstat_deleted);
1842                         bytes_deleted += ab->b_size;
1843
1844                         if (ab->b_l2hdr != NULL) {
1845                                 /*
1846                                  * This buffer is cached on the 2nd Level ARC;
1847                                  * don't destroy the header.
1848                                  */
1849                                 arc_change_state(arc_l2c_only, ab, hash_lock);
1850                                 mutex_exit(hash_lock);
1851                         } else {
1852                                 arc_change_state(arc_anon, ab, hash_lock);
1853                                 mutex_exit(hash_lock);
1854                                 arc_hdr_destroy(ab);
1855                         }
1856
1857                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1858                         if (bytes >= 0 && bytes_deleted >= bytes)
1859                                 break;
1860                 } else if (bytes < 0) {
1861                         /*
1862                          * Insert a list marker and then wait for the
1863                          * hash lock to become available. Once its
1864                          * available, restart from where we left off.
1865                          */
1866                         list_insert_after(list, ab, &marker);
1867                         mutex_exit(&state->arcs_mtx);
1868                         mutex_enter(hash_lock);
1869                         mutex_exit(hash_lock);
1870                         mutex_enter(&state->arcs_mtx);
1871                         ab_prev = list_prev(list, &marker);
1872                         list_remove(list, &marker);
1873                 } else
1874                         bufs_skipped += 1;
1875         }
1876         mutex_exit(&state->arcs_mtx);
1877
1878         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1879             (bytes < 0 || bytes_deleted < bytes)) {
1880                 list = &state->arcs_list[ARC_BUFC_METADATA];
1881                 goto top;
1882         }
1883
1884         if (bufs_skipped) {
1885                 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1886                 ASSERT(bytes >= 0);
1887         }
1888
1889         if (bytes_deleted < bytes)
1890                 dprintf("only deleted %lld bytes from %p\n",
1891                     (longlong_t)bytes_deleted, state);
1892 }
1893
1894 static void
1895 arc_adjust(void)
1896 {
1897         int64_t adjustment, delta;
1898
1899         /*
1900          * Adjust MRU size
1901          */
1902
1903         adjustment = MIN((int64_t)(arc_size - arc_c),
1904             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
1905             arc_p));
1906
1907         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1908                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
1909                 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
1910                 adjustment -= delta;
1911         }
1912
1913         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1914                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
1915                 (void) arc_evict(arc_mru, 0, delta, FALSE,
1916                     ARC_BUFC_METADATA);
1917         }
1918
1919         /*
1920          * Adjust MFU size
1921          */
1922
1923         adjustment = arc_size - arc_c;
1924
1925         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1926                 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
1927                 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
1928                 adjustment -= delta;
1929         }
1930
1931         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1932                 int64_t delta = MIN(adjustment,
1933                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
1934                 (void) arc_evict(arc_mfu, 0, delta, FALSE,
1935                     ARC_BUFC_METADATA);
1936         }
1937
1938         /*
1939          * Adjust ghost lists
1940          */
1941
1942         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
1943
1944         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
1945                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
1946                 arc_evict_ghost(arc_mru_ghost, 0, delta);
1947         }
1948
1949         adjustment =
1950             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
1951
1952         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
1953                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
1954                 arc_evict_ghost(arc_mfu_ghost, 0, delta);
1955         }
1956 }
1957
1958 /*
1959  * Request that arc user drop references so that N bytes can be released
1960  * from the cache.  This provides a mechanism to ensure the arc can honor
1961  * the arc_meta_limit and reclaim buffers which are pinned in the cache
1962  * by higher layers.  (i.e. the zpl)
1963  */
1964 static void
1965 arc_do_user_prune(int64_t adjustment)
1966 {
1967         arc_prune_func_t *func;
1968         void *private;
1969         arc_prune_t *cp, *np;
1970
1971         mutex_enter(&arc_prune_mtx);
1972
1973         cp = list_head(&arc_prune_list);
1974         while (cp != NULL) {
1975                 func = cp->p_pfunc;
1976                 private = cp->p_private;
1977                 np = list_next(&arc_prune_list, cp);
1978                 refcount_add(&cp->p_refcnt, func);
1979                 mutex_exit(&arc_prune_mtx);
1980
1981                 if (func != NULL)
1982                         func(adjustment, private);
1983
1984                 mutex_enter(&arc_prune_mtx);
1985
1986                 /* User removed prune callback concurrently with execution */
1987                 if (refcount_remove(&cp->p_refcnt, func) == 0) {
1988                         ASSERT(!list_link_active(&cp->p_node));
1989                         refcount_destroy(&cp->p_refcnt);
1990                         kmem_free(cp, sizeof (*cp));
1991                 }
1992
1993                 cp = np;
1994         }
1995
1996         ARCSTAT_BUMP(arcstat_prune);
1997         mutex_exit(&arc_prune_mtx);
1998 }
1999
2000 static void
2001 arc_do_user_evicts(void)
2002 {
2003         mutex_enter(&arc_eviction_mtx);
2004         while (arc_eviction_list != NULL) {
2005                 arc_buf_t *buf = arc_eviction_list;
2006                 arc_eviction_list = buf->b_next;
2007                 mutex_enter(&buf->b_evict_lock);
2008                 buf->b_hdr = NULL;
2009                 mutex_exit(&buf->b_evict_lock);
2010                 mutex_exit(&arc_eviction_mtx);
2011
2012                 if (buf->b_efunc != NULL)
2013                         VERIFY(buf->b_efunc(buf) == 0);
2014
2015                 buf->b_efunc = NULL;
2016                 buf->b_private = NULL;
2017                 kmem_cache_free(buf_cache, buf);
2018                 mutex_enter(&arc_eviction_mtx);
2019         }
2020         mutex_exit(&arc_eviction_mtx);
2021 }
2022
2023 /*
2024  * Evict only meta data objects from the cache leaving the data objects.
2025  * This is only used to enforce the tunable arc_meta_limit, if we are
2026  * unable to evict enough buffers notify the user via the prune callback.
2027  */
2028 void
2029 arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
2030 {
2031         int64_t delta;
2032
2033         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2034                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2035                 arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
2036                 adjustment -= delta;
2037         }
2038
2039         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2040                 delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2041                 arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
2042                 adjustment -= delta;
2043         }
2044
2045         if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
2046                 arc_do_user_prune(arc_meta_prune);
2047 }
2048
2049 /*
2050  * Flush all *evictable* data from the cache for the given spa.
2051  * NOTE: this will not touch "active" (i.e. referenced) data.
2052  */
2053 void
2054 arc_flush(spa_t *spa)
2055 {
2056         uint64_t guid = 0;
2057
2058         if (spa)
2059                 guid = spa_guid(spa);
2060
2061         while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2062                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2063                 if (spa)
2064                         break;
2065         }
2066         while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2067                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2068                 if (spa)
2069                         break;
2070         }
2071         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2072                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2073                 if (spa)
2074                         break;
2075         }
2076         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2077                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2078                 if (spa)
2079                         break;
2080         }
2081
2082         arc_evict_ghost(arc_mru_ghost, guid, -1);
2083         arc_evict_ghost(arc_mfu_ghost, guid, -1);
2084
2085         mutex_enter(&arc_reclaim_thr_lock);
2086         arc_do_user_evicts();
2087         mutex_exit(&arc_reclaim_thr_lock);
2088         ASSERT(spa || arc_eviction_list == NULL);
2089 }
2090
2091 void
2092 arc_shrink(uint64_t bytes)
2093 {
2094         if (arc_c > arc_c_min) {
2095                 uint64_t to_free;
2096
2097                 to_free = bytes ? bytes : arc_c >> arc_shrink_shift;
2098
2099                 if (arc_c > arc_c_min + to_free)
2100                         atomic_add_64(&arc_c, -to_free);
2101                 else
2102                         arc_c = arc_c_min;
2103
2104                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2105                 if (arc_c > arc_size)
2106                         arc_c = MAX(arc_size, arc_c_min);
2107                 if (arc_p > arc_c)
2108                         arc_p = (arc_c >> 1);
2109                 ASSERT(arc_c >= arc_c_min);
2110                 ASSERT((int64_t)arc_p >= 0);
2111         }
2112
2113         if (arc_size > arc_c)
2114                 arc_adjust();
2115 }
2116
2117 static void
2118 arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
2119 {
2120         size_t                  i;
2121         kmem_cache_t            *prev_cache = NULL;
2122         kmem_cache_t            *prev_data_cache = NULL;
2123         extern kmem_cache_t     *zio_buf_cache[];
2124         extern kmem_cache_t     *zio_data_buf_cache[];
2125
2126         /*
2127          * An aggressive reclamation will shrink the cache size as well as
2128          * reap free buffers from the arc kmem caches.
2129          */
2130         if (strat == ARC_RECLAIM_AGGR)
2131                 arc_shrink(bytes);
2132
2133         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2134                 if (zio_buf_cache[i] != prev_cache) {
2135                         prev_cache = zio_buf_cache[i];
2136                         kmem_cache_reap_now(zio_buf_cache[i]);
2137                 }
2138                 if (zio_data_buf_cache[i] != prev_data_cache) {
2139                         prev_data_cache = zio_data_buf_cache[i];
2140                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2141                 }
2142         }
2143
2144         kmem_cache_reap_now(buf_cache);
2145         kmem_cache_reap_now(hdr_cache);
2146 }
2147
2148 /*
2149  * Unlike other ZFS implementations this thread is only responsible for
2150  * adapting the target ARC size on Linux.  The responsibility for memory
2151  * reclamation has been entirely delegated to the arc_shrinker_func()
2152  * which is registered with the VM.  To reflect this change in behavior
2153  * the arc_reclaim thread has been renamed to arc_adapt.
2154  */
2155 static void
2156 arc_adapt_thread(void)
2157 {
2158         callb_cpr_t             cpr;
2159         int64_t                 prune;
2160
2161         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2162
2163         mutex_enter(&arc_reclaim_thr_lock);
2164         while (arc_thread_exit == 0) {
2165 #ifndef _KERNEL
2166                 arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2167
2168                 if (spa_get_random(100) == 0) {
2169
2170                         if (arc_no_grow) {
2171                                 if (last_reclaim == ARC_RECLAIM_CONS) {
2172                                         last_reclaim = ARC_RECLAIM_AGGR;
2173                                 } else {
2174                                         last_reclaim = ARC_RECLAIM_CONS;
2175                                 }
2176                         } else {
2177                                 arc_no_grow = TRUE;
2178                                 last_reclaim = ARC_RECLAIM_AGGR;
2179                                 membar_producer();
2180                         }
2181
2182                         /* reset the growth delay for every reclaim */
2183                         arc_grow_time = ddi_get_lbolt()+(arc_grow_retry * hz);
2184
2185                         arc_kmem_reap_now(last_reclaim, 0);
2186                         arc_warm = B_TRUE;
2187                 }
2188 #endif /* !_KERNEL */
2189
2190                 /* No recent memory pressure allow the ARC to grow. */
2191                 if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
2192                         arc_no_grow = FALSE;
2193
2194                 /*
2195                  * Keep meta data usage within limits, arc_shrink() is not
2196                  * used to avoid collapsing the arc_c value when only the
2197                  * arc_meta_limit is being exceeded.
2198                  */
2199                 prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
2200                 if (prune > 0)
2201                         arc_adjust_meta(prune, B_TRUE);
2202
2203                 arc_adjust();
2204
2205                 if (arc_eviction_list != NULL)
2206                         arc_do_user_evicts();
2207
2208                 /* block until needed, or one second, whichever is shorter */
2209                 CALLB_CPR_SAFE_BEGIN(&cpr);
2210                 (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
2211                     &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2212                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2213         }
2214
2215         arc_thread_exit = 0;
2216         cv_broadcast(&arc_reclaim_thr_cv);
2217         CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_thr_lock */
2218         thread_exit();
2219 }
2220
2221 #ifdef _KERNEL
2222 /*
2223  * Determine the amount of memory eligible for eviction contained in the
2224  * ARC. All clean data reported by the ghost lists can always be safely
2225  * evicted. Due to arc_c_min, the same does not hold for all clean data
2226  * contained by the regular mru and mfu lists.
2227  *
2228  * In the case of the regular mru and mfu lists, we need to report as
2229  * much clean data as possible, such that evicting that same reported
2230  * data will not bring arc_size below arc_c_min. Thus, in certain
2231  * circumstances, the total amount of clean data in the mru and mfu
2232  * lists might not actually be evictable.
2233  *
2234  * The following two distinct cases are accounted for:
2235  *
2236  * 1. The sum of the amount of dirty data contained by both the mru and
2237  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
2238  *    is greater than or equal to arc_c_min.
2239  *    (i.e. amount of dirty data >= arc_c_min)
2240  *
2241  *    This is the easy case; all clean data contained by the mru and mfu
2242  *    lists is evictable. Evicting all clean data can only drop arc_size
2243  *    to the amount of dirty data, which is greater than arc_c_min.
2244  *
2245  * 2. The sum of the amount of dirty data contained by both the mru and
2246  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
2247  *    is less than arc_c_min.
2248  *    (i.e. arc_c_min > amount of dirty data)
2249  *
2250  *    2.1. arc_size is greater than or equal arc_c_min.
2251  *         (i.e. arc_size >= arc_c_min > amount of dirty data)
2252  *
2253  *         In this case, not all clean data from the regular mru and mfu
2254  *         lists is actually evictable; we must leave enough clean data
2255  *         to keep arc_size above arc_c_min. Thus, the maximum amount of
2256  *         evictable data from the two lists combined, is exactly the
2257  *         difference between arc_size and arc_c_min.
2258  *
2259  *    2.2. arc_size is less than arc_c_min
2260  *         (i.e. arc_c_min > arc_size > amount of dirty data)
2261  *
2262  *         In this case, none of the data contained in the mru and mfu
2263  *         lists is evictable, even if it's clean. Since arc_size is
2264  *         already below arc_c_min, evicting any more would only
2265  *         increase this negative difference.
2266  */
2267 static uint64_t
2268 arc_evictable_memory(void) {
2269         uint64_t arc_clean =
2270             arc_mru->arcs_lsize[ARC_BUFC_DATA] +
2271             arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
2272             arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
2273             arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
2274         uint64_t ghost_clean =
2275             arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
2276             arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
2277             arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
2278             arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
2279         uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
2280
2281         if (arc_dirty >= arc_c_min)
2282                 return (ghost_clean + arc_clean);
2283
2284         return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
2285 }
2286
2287 static int
2288 __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
2289 {
2290         uint64_t pages;
2291
2292         /* The arc is considered warm once reclaim has occurred */
2293         if (unlikely(arc_warm == B_FALSE))
2294                 arc_warm = B_TRUE;
2295
2296         /* Return the potential number of reclaimable pages */
2297         pages = btop(arc_evictable_memory());
2298         if (sc->nr_to_scan == 0)
2299                 return (pages);
2300
2301         /* Not allowed to perform filesystem reclaim */
2302         if (!(sc->gfp_mask & __GFP_FS))
2303                 return (-1);
2304
2305         /* Reclaim in progress */
2306         if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
2307                 return (-1);
2308
2309         /*
2310          * Evict the requested number of pages by shrinking arc_c the
2311          * requested amount.  If there is nothing left to evict just
2312          * reap whatever we can from the various arc slabs.
2313          */
2314         if (pages > 0) {
2315                 arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
2316                 pages = btop(arc_evictable_memory());
2317         } else {
2318                 arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
2319                 pages = -1;
2320         }
2321
2322         /*
2323          * When direct reclaim is observed it usually indicates a rapid
2324          * increase in memory pressure.  This occurs because the kswapd
2325          * threads were unable to asynchronously keep enough free memory
2326          * available.  In this case set arc_no_grow to briefly pause arc
2327          * growth to avoid compounding the memory pressure.
2328          */
2329         if (current_is_kswapd()) {
2330                 ARCSTAT_BUMP(arcstat_memory_indirect_count);
2331         } else {
2332                 arc_no_grow = B_TRUE;
2333                 arc_grow_time = ddi_get_lbolt() + (arc_grow_retry * hz);
2334                 ARCSTAT_BUMP(arcstat_memory_direct_count);
2335         }
2336
2337         mutex_exit(&arc_reclaim_thr_lock);
2338
2339         return (pages);
2340 }
2341 SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
2342
2343 SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
2344 #endif /* _KERNEL */
2345
2346 /*
2347  * Adapt arc info given the number of bytes we are trying to add and
2348  * the state that we are comming from.  This function is only called
2349  * when we are adding new content to the cache.
2350  */
2351 static void
2352 arc_adapt(int bytes, arc_state_t *state)
2353 {
2354         int mult;
2355         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2356
2357         if (state == arc_l2c_only)
2358                 return;
2359
2360         ASSERT(bytes > 0);
2361         /*
2362          * Adapt the target size of the MRU list:
2363          *      - if we just hit in the MRU ghost list, then increase
2364          *        the target size of the MRU list.
2365          *      - if we just hit in the MFU ghost list, then increase
2366          *        the target size of the MFU list by decreasing the
2367          *        target size of the MRU list.
2368          */
2369         if (state == arc_mru_ghost) {
2370                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2371                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2372                 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2373
2374                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2375         } else if (state == arc_mfu_ghost) {
2376                 uint64_t delta;
2377
2378                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2379                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2380                 mult = MIN(mult, 10);
2381
2382                 delta = MIN(bytes * mult, arc_p);
2383                 arc_p = MAX(arc_p_min, arc_p - delta);
2384         }
2385         ASSERT((int64_t)arc_p >= 0);
2386
2387         if (arc_no_grow)
2388                 return;
2389
2390         if (arc_c >= arc_c_max)
2391                 return;
2392
2393         /*
2394          * If we're within (2 * maxblocksize) bytes of the target
2395          * cache size, increment the target cache size
2396          */
2397         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2398                 atomic_add_64(&arc_c, (int64_t)bytes);
2399                 if (arc_c > arc_c_max)
2400                         arc_c = arc_c_max;
2401                 else if (state == arc_anon)
2402                         atomic_add_64(&arc_p, (int64_t)bytes);
2403                 if (arc_p > arc_c)
2404                         arc_p = arc_c;
2405         }
2406         ASSERT((int64_t)arc_p >= 0);
2407 }
2408
2409 /*
2410  * Check if the cache has reached its limits and eviction is required
2411  * prior to insert.
2412  */
2413 static int
2414 arc_evict_needed(arc_buf_contents_t type)
2415 {
2416         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2417                 return (1);
2418
2419 #ifdef _KERNEL
2420         /*
2421          * If zio data pages are being allocated out of a separate heap segment,
2422          * then enforce that the size of available vmem for this area remains
2423          * above about 1/32nd free.
2424          */
2425         if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2426             vmem_size(zio_arena, VMEM_FREE) <
2427             (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2428                 return (1);
2429 #endif
2430
2431         if (arc_no_grow)
2432                 return (1);
2433
2434         return (arc_size > arc_c);
2435 }
2436
2437 /*
2438  * The buffer, supplied as the first argument, needs a data block.
2439  * So, if we are at cache max, determine which cache should be victimized.
2440  * We have the following cases:
2441  *
2442  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2443  * In this situation if we're out of space, but the resident size of the MFU is
2444  * under the limit, victimize the MFU cache to satisfy this insertion request.
2445  *
2446  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2447  * Here, we've used up all of the available space for the MRU, so we need to
2448  * evict from our own cache instead.  Evict from the set of resident MRU
2449  * entries.
2450  *
2451  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2452  * c minus p represents the MFU space in the cache, since p is the size of the
2453  * cache that is dedicated to the MRU.  In this situation there's still space on
2454  * the MFU side, so the MRU side needs to be victimized.
2455  *
2456  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2457  * MFU's resident set is consuming more space than it has been allotted.  In
2458  * this situation, we must victimize our own cache, the MFU, for this insertion.
2459  */
2460 static void
2461 arc_get_data_buf(arc_buf_t *buf)
2462 {
2463         arc_state_t             *state = buf->b_hdr->b_state;
2464         uint64_t                size = buf->b_hdr->b_size;
2465         arc_buf_contents_t      type = buf->b_hdr->b_type;
2466
2467         arc_adapt(size, state);
2468
2469         /*
2470          * We have not yet reached cache maximum size,
2471          * just allocate a new buffer.
2472          */
2473         if (!arc_evict_needed(type)) {
2474                 if (type == ARC_BUFC_METADATA) {
2475                         buf->b_data = zio_buf_alloc(size);
2476                         arc_space_consume(size, ARC_SPACE_DATA);
2477                 } else {
2478                         ASSERT(type == ARC_BUFC_DATA);
2479                         buf->b_data = zio_data_buf_alloc(size);
2480                         ARCSTAT_INCR(arcstat_data_size, size);
2481                         atomic_add_64(&arc_size, size);
2482                 }
2483                 goto out;
2484         }
2485
2486         /*
2487          * If we are prefetching from the mfu ghost list, this buffer
2488          * will end up on the mru list; so steal space from there.
2489          */
2490         if (state == arc_mfu_ghost)
2491                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2492         else if (state == arc_mru_ghost)
2493                 state = arc_mru;
2494
2495         if (state == arc_mru || state == arc_anon) {
2496                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2497                 state = (arc_mfu->arcs_lsize[type] >= size &&
2498                     arc_p > mru_used) ? arc_mfu : arc_mru;
2499         } else {
2500                 /* MFU cases */
2501                 uint64_t mfu_space = arc_c - arc_p;
2502                 state =  (arc_mru->arcs_lsize[type] >= size &&
2503                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2504         }
2505
2506         if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2507                 if (type == ARC_BUFC_METADATA) {
2508                         buf->b_data = zio_buf_alloc(size);
2509                         arc_space_consume(size, ARC_SPACE_DATA);
2510
2511                         /*
2512                          * If we are unable to recycle an existing meta buffer
2513                          * signal the reclaim thread.  It will notify users
2514                          * via the prune callback to drop references.  The
2515                          * prune callback in run in the context of the reclaim
2516                          * thread to avoid deadlocking on the hash_lock.
2517                          */
2518                         cv_signal(&arc_reclaim_thr_cv);
2519                 } else {
2520                         ASSERT(type == ARC_BUFC_DATA);
2521                         buf->b_data = zio_data_buf_alloc(size);
2522                         ARCSTAT_INCR(arcstat_data_size, size);
2523                         atomic_add_64(&arc_size, size);
2524                 }
2525
2526                 ARCSTAT_BUMP(arcstat_recycle_miss);
2527         }
2528         ASSERT(buf->b_data != NULL);
2529 out:
2530         /*
2531          * Update the state size.  Note that ghost states have a
2532          * "ghost size" and so don't need to be updated.
2533          */
2534         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2535                 arc_buf_hdr_t *hdr = buf->b_hdr;
2536
2537                 atomic_add_64(&hdr->b_state->arcs_size, size);
2538                 if (list_link_active(&hdr->b_arc_node)) {
2539                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2540                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2541                 }
2542                 /*
2543                  * If we are growing the cache, and we are adding anonymous
2544                  * data, and we have outgrown arc_p, update arc_p
2545                  */
2546                 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2547                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2548                         arc_p = MIN(arc_c, arc_p + size);
2549         }
2550 }
2551
2552 /*
2553  * This routine is called whenever a buffer is accessed.
2554  * NOTE: the hash lock is dropped in this function.
2555  */
2556 static void
2557 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2558 {
2559         clock_t now;
2560
2561         ASSERT(MUTEX_HELD(hash_lock));
2562
2563         if (buf->b_state == arc_anon) {
2564                 /*
2565                  * This buffer is not in the cache, and does not
2566                  * appear in our "ghost" list.  Add the new buffer
2567                  * to the MRU state.
2568                  */
2569
2570                 ASSERT(buf->b_arc_access == 0);
2571                 buf->b_arc_access = ddi_get_lbolt();
2572                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2573                 arc_change_state(arc_mru, buf, hash_lock);
2574
2575         } else if (buf->b_state == arc_mru) {
2576                 now = ddi_get_lbolt();
2577
2578                 /*
2579                  * If this buffer is here because of a prefetch, then either:
2580                  * - clear the flag if this is a "referencing" read
2581                  *   (any subsequent access will bump this into the MFU state).
2582                  * or
2583                  * - move the buffer to the head of the list if this is
2584                  *   another prefetch (to make it less likely to be evicted).
2585                  */
2586                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2587                         if (refcount_count(&buf->b_refcnt) == 0) {
2588                                 ASSERT(list_link_active(&buf->b_arc_node));
2589                         } else {
2590                                 buf->b_flags &= ~ARC_PREFETCH;
2591                                 ARCSTAT_BUMP(arcstat_mru_hits);
2592                         }
2593                         buf->b_arc_access = now;
2594                         return;
2595                 }
2596
2597                 /*
2598                  * This buffer has been "accessed" only once so far,
2599                  * but it is still in the cache. Move it to the MFU
2600                  * state.
2601                  */
2602                 if (now > buf->b_arc_access + ARC_MINTIME) {
2603                         /*
2604                          * More than 125ms have passed since we
2605                          * instantiated this buffer.  Move it to the
2606                          * most frequently used state.
2607                          */
2608                         buf->b_arc_access = now;
2609                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2610                         arc_change_state(arc_mfu, buf, hash_lock);
2611                 }
2612                 ARCSTAT_BUMP(arcstat_mru_hits);
2613         } else if (buf->b_state == arc_mru_ghost) {
2614                 arc_state_t     *new_state;
2615                 /*
2616                  * This buffer has been "accessed" recently, but
2617                  * was evicted from the cache.  Move it to the
2618                  * MFU state.
2619                  */
2620
2621                 if (buf->b_flags & ARC_PREFETCH) {
2622                         new_state = arc_mru;
2623                         if (refcount_count(&buf->b_refcnt) > 0)
2624                                 buf->b_flags &= ~ARC_PREFETCH;
2625                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2626                 } else {
2627                         new_state = arc_mfu;
2628                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2629                 }
2630
2631                 buf->b_arc_access = ddi_get_lbolt();
2632                 arc_change_state(new_state, buf, hash_lock);
2633
2634                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2635         } else if (buf->b_state == arc_mfu) {
2636                 /*
2637                  * This buffer has been accessed more than once and is
2638                  * still in the cache.  Keep it in the MFU state.
2639                  *
2640                  * NOTE: an add_reference() that occurred when we did
2641                  * the arc_read() will have kicked this off the list.
2642                  * If it was a prefetch, we will explicitly move it to
2643                  * the head of the list now.
2644                  */
2645                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2646                         ASSERT(refcount_count(&buf->b_refcnt) == 0);
2647                         ASSERT(list_link_active(&buf->b_arc_node));
2648                 }
2649                 ARCSTAT_BUMP(arcstat_mfu_hits);
2650                 buf->b_arc_access = ddi_get_lbolt();
2651         } else if (buf->b_state == arc_mfu_ghost) {
2652                 arc_state_t     *new_state = arc_mfu;
2653                 /*
2654                  * This buffer has been accessed more than once but has
2655                  * been evicted from the cache.  Move it back to the
2656                  * MFU state.
2657                  */
2658
2659                 if (buf->b_flags & ARC_PREFETCH) {
2660                         /*
2661                          * This is a prefetch access...
2662                          * move this block back to the MRU state.
2663                          */
2664                         ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
2665                         new_state = arc_mru;
2666                 }
2667
2668                 buf->b_arc_access = ddi_get_lbolt();
2669                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2670                 arc_change_state(new_state, buf, hash_lock);
2671
2672                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2673         } else if (buf->b_state == arc_l2c_only) {
2674                 /*
2675                  * This buffer is on the 2nd Level ARC.
2676                  */
2677
2678                 buf->b_arc_access = ddi_get_lbolt();
2679                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2680                 arc_change_state(arc_mfu, buf, hash_lock);
2681         } else {
2682                 ASSERT(!"invalid arc state");
2683         }
2684 }
2685
2686 /* a generic arc_done_func_t which you can use */
2687 /* ARGSUSED */
2688 void
2689 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2690 {
2691         if (zio == NULL || zio->io_error == 0)
2692                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2693         VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2694 }
2695
2696 /* a generic arc_done_func_t */
2697 void
2698 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2699 {
2700         arc_buf_t **bufp = arg;
2701         if (zio && zio->io_error) {
2702                 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2703                 *bufp = NULL;
2704         } else {
2705                 *bufp = buf;
2706                 ASSERT(buf->b_data);
2707         }
2708 }
2709
2710 static void
2711 arc_read_done(zio_t *zio)
2712 {
2713         arc_buf_hdr_t   *hdr, *found;
2714         arc_buf_t       *buf;
2715         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
2716         kmutex_t        *hash_lock;
2717         arc_callback_t  *callback_list, *acb;
2718         int             freeable = FALSE;
2719
2720         buf = zio->io_private;
2721         hdr = buf->b_hdr;
2722
2723         /*
2724          * The hdr was inserted into hash-table and removed from lists
2725          * prior to starting I/O.  We should find this header, since
2726          * it's in the hash table, and it should be legit since it's
2727          * not possible to evict it during the I/O.  The only possible
2728          * reason for it not to be found is if we were freed during the
2729          * read.
2730          */
2731         found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2732             &hash_lock);
2733
2734         ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2735             (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2736             (found == hdr && HDR_L2_READING(hdr)));
2737
2738         hdr->b_flags &= ~ARC_L2_EVICTED;
2739         if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2740                 hdr->b_flags &= ~ARC_L2CACHE;
2741
2742         /* byteswap if necessary */
2743         callback_list = hdr->b_acb;
2744         ASSERT(callback_list != NULL);
2745         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2746                 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2747                     byteswap_uint64_array :
2748                     dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
2749                 func(buf->b_data, hdr->b_size);
2750         }
2751
2752         arc_cksum_compute(buf, B_FALSE);
2753
2754         if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2755                 /*
2756                  * Only call arc_access on anonymous buffers.  This is because
2757                  * if we've issued an I/O for an evicted buffer, we've already
2758                  * called arc_access (to prevent any simultaneous readers from
2759                  * getting confused).
2760                  */
2761                 arc_access(hdr, hash_lock);
2762         }
2763
2764         /* create copies of the data buffer for the callers */
2765         abuf = buf;
2766         for (acb = callback_list; acb; acb = acb->acb_next) {
2767                 if (acb->acb_done) {
2768                         if (abuf == NULL)
2769                                 abuf = arc_buf_clone(buf);
2770                         acb->acb_buf = abuf;
2771                         abuf = NULL;
2772                 }
2773         }
2774         hdr->b_acb = NULL;
2775         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2776         ASSERT(!HDR_BUF_AVAILABLE(hdr));
2777         if (abuf == buf) {
2778                 ASSERT(buf->b_efunc == NULL);
2779                 ASSERT(hdr->b_datacnt == 1);
2780                 hdr->b_flags |= ARC_BUF_AVAILABLE;
2781         }
2782
2783         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2784
2785         if (zio->io_error != 0) {
2786                 hdr->b_flags |= ARC_IO_ERROR;
2787                 if (hdr->b_state != arc_anon)
2788                         arc_change_state(arc_anon, hdr, hash_lock);
2789                 if (HDR_IN_HASH_TABLE(hdr))
2790                         buf_hash_remove(hdr);
2791                 freeable = refcount_is_zero(&hdr->b_refcnt);
2792         }
2793
2794         /*
2795          * Broadcast before we drop the hash_lock to avoid the possibility
2796          * that the hdr (and hence the cv) might be freed before we get to
2797          * the cv_broadcast().
2798          */
2799         cv_broadcast(&hdr->b_cv);
2800
2801         if (hash_lock) {
2802                 mutex_exit(hash_lock);
2803         } else {
2804                 /*
2805                  * This block was freed while we waited for the read to
2806                  * complete.  It has been removed from the hash table and
2807                  * moved to the anonymous state (so that it won't show up
2808                  * in the cache).
2809                  */
2810                 ASSERT3P(hdr->b_state, ==, arc_anon);
2811                 freeable = refcount_is_zero(&hdr->b_refcnt);
2812         }
2813
2814         /* execute each callback and free its structure */
2815         while ((acb = callback_list) != NULL) {
2816                 if (acb->acb_done)
2817                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2818
2819                 if (acb->acb_zio_dummy != NULL) {
2820                         acb->acb_zio_dummy->io_error = zio->io_error;
2821                         zio_nowait(acb->acb_zio_dummy);
2822                 }
2823
2824                 callback_list = acb->acb_next;
2825                 kmem_free(acb, sizeof (arc_callback_t));
2826         }
2827
2828         if (freeable)
2829                 arc_hdr_destroy(hdr);
2830 }
2831
2832 /*
2833  * "Read" the block block at the specified DVA (in bp) via the
2834  * cache.  If the block is found in the cache, invoke the provided
2835  * callback immediately and return.  Note that the `zio' parameter
2836  * in the callback will be NULL in this case, since no IO was
2837  * required.  If the block is not in the cache pass the read request
2838  * on to the spa with a substitute callback function, so that the
2839  * requested block will be added to the cache.
2840  *
2841  * If a read request arrives for a block that has a read in-progress,
2842  * either wait for the in-progress read to complete (and return the
2843  * results); or, if this is a read with a "done" func, add a record
2844  * to the read to invoke the "done" func when the read completes,
2845  * and return; or just return.
2846  *
2847  * arc_read_done() will invoke all the requested "done" functions
2848  * for readers of this block.
2849  *
2850  * Normal callers should use arc_read and pass the arc buffer and offset
2851  * for the bp.  But if you know you don't need locking, you can use
2852  * arc_read_bp.
2853  */
2854 int
2855 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
2856     arc_done_func_t *done, void *private, int priority, int zio_flags,
2857     uint32_t *arc_flags, const zbookmark_t *zb)
2858 {
2859         int err;
2860
2861         if (pbuf == NULL) {
2862                 /*
2863                  * XXX This happens from traverse callback funcs, for
2864                  * the objset_phys_t block.
2865                  */
2866                 return (arc_read_nolock(pio, spa, bp, done, private, priority,
2867                     zio_flags, arc_flags, zb));
2868         }
2869
2870         ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
2871         ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
2872         rw_enter(&pbuf->b_data_lock, RW_READER);
2873
2874         err = arc_read_nolock(pio, spa, bp, done, private, priority,
2875             zio_flags, arc_flags, zb);
2876         rw_exit(&pbuf->b_data_lock);
2877
2878         return (err);
2879 }
2880
2881 int
2882 arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
2883     arc_done_func_t *done, void *private, int priority, int zio_flags,
2884     uint32_t *arc_flags, const zbookmark_t *zb)
2885 {
2886         arc_buf_hdr_t *hdr;
2887         arc_buf_t *buf = NULL;
2888         kmutex_t *hash_lock;
2889         zio_t *rzio;
2890         uint64_t guid = spa_guid(spa);
2891
2892 top:
2893         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2894             &hash_lock);
2895         if (hdr && hdr->b_datacnt > 0) {
2896
2897                 *arc_flags |= ARC_CACHED;
2898
2899                 if (HDR_IO_IN_PROGRESS(hdr)) {
2900
2901                         if (*arc_flags & ARC_WAIT) {
2902                                 cv_wait(&hdr->b_cv, hash_lock);
2903                                 mutex_exit(hash_lock);
2904                                 goto top;
2905                         }
2906                         ASSERT(*arc_flags & ARC_NOWAIT);
2907
2908                         if (done) {
2909                                 arc_callback_t  *acb = NULL;
2910
2911                                 acb = kmem_zalloc(sizeof (arc_callback_t),
2912                                     KM_PUSHPAGE);
2913                                 acb->acb_done = done;
2914                                 acb->acb_private = private;
2915                                 if (pio != NULL)
2916                                         acb->acb_zio_dummy = zio_null(pio,
2917                                             spa, NULL, NULL, NULL, zio_flags);
2918
2919                                 ASSERT(acb->acb_done != NULL);
2920                                 acb->acb_next = hdr->b_acb;
2921                                 hdr->b_acb = acb;
2922                                 add_reference(hdr, hash_lock, private);
2923                                 mutex_exit(hash_lock);
2924                                 return (0);
2925                         }
2926                         mutex_exit(hash_lock);
2927                         return (0);
2928                 }
2929
2930                 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2931
2932                 if (done) {
2933                         add_reference(hdr, hash_lock, private);
2934                         /*
2935                          * If this block is already in use, create a new
2936                          * copy of the data so that we will be guaranteed
2937                          * that arc_release() will always succeed.
2938                          */
2939                         buf = hdr->b_buf;
2940                         ASSERT(buf);
2941                         ASSERT(buf->b_data);
2942                         if (HDR_BUF_AVAILABLE(hdr)) {
2943                                 ASSERT(buf->b_efunc == NULL);
2944                                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2945                         } else {
2946                                 buf = arc_buf_clone(buf);
2947                         }
2948
2949                 } else if (*arc_flags & ARC_PREFETCH &&
2950                     refcount_count(&hdr->b_refcnt) == 0) {
2951                         hdr->b_flags |= ARC_PREFETCH;
2952                 }
2953                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2954                 arc_access(hdr, hash_lock);
2955                 if (*arc_flags & ARC_L2CACHE)
2956                         hdr->b_flags |= ARC_L2CACHE;
2957                 mutex_exit(hash_lock);
2958                 ARCSTAT_BUMP(arcstat_hits);
2959                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2960                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2961                     data, metadata, hits);
2962
2963                 if (done)
2964                         done(NULL, buf, private);
2965         } else {
2966                 uint64_t size = BP_GET_LSIZE(bp);
2967                 arc_callback_t  *acb;
2968                 vdev_t *vd = NULL;
2969                 uint64_t addr = -1;
2970                 boolean_t devw = B_FALSE;
2971
2972                 if (hdr == NULL) {
2973                         /* this block is not in the cache */
2974                         arc_buf_hdr_t   *exists;
2975                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2976                         buf = arc_buf_alloc(spa, size, private, type);
2977                         hdr = buf->b_hdr;
2978                         hdr->b_dva = *BP_IDENTITY(bp);
2979                         hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2980                         hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2981                         exists = buf_hash_insert(hdr, &hash_lock);
2982                         if (exists) {
2983                                 /* somebody beat us to the hash insert */
2984                                 mutex_exit(hash_lock);
2985                                 buf_discard_identity(hdr);
2986                                 (void) arc_buf_remove_ref(buf, private);
2987                                 goto top; /* restart the IO request */
2988                         }
2989                         /* if this is a prefetch, we don't have a reference */
2990                         if (*arc_flags & ARC_PREFETCH) {
2991                                 (void) remove_reference(hdr, hash_lock,
2992                                     private);
2993                                 hdr->b_flags |= ARC_PREFETCH;
2994                         }
2995                         if (*arc_flags & ARC_L2CACHE)
2996                                 hdr->b_flags |= ARC_L2CACHE;
2997                         if (BP_GET_LEVEL(bp) > 0)
2998                                 hdr->b_flags |= ARC_INDIRECT;
2999                 } else {
3000                         /* this block is in the ghost cache */
3001                         ASSERT(GHOST_STATE(hdr->b_state));
3002                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3003                         ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
3004                         ASSERT(hdr->b_buf == NULL);
3005
3006                         /* if this is a prefetch, we don't have a reference */
3007                         if (*arc_flags & ARC_PREFETCH)
3008                                 hdr->b_flags |= ARC_PREFETCH;
3009                         else
3010                                 add_reference(hdr, hash_lock, private);
3011                         if (*arc_flags & ARC_L2CACHE)
3012                                 hdr->b_flags |= ARC_L2CACHE;
3013                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3014                         buf->b_hdr = hdr;
3015                         buf->b_data = NULL;
3016                         buf->b_efunc = NULL;
3017                         buf->b_private = NULL;
3018                         buf->b_next = NULL;
3019                         hdr->b_buf = buf;
3020                         ASSERT(hdr->b_datacnt == 0);
3021                         hdr->b_datacnt = 1;
3022                         arc_get_data_buf(buf);
3023                         arc_access(hdr, hash_lock);
3024                 }
3025
3026                 ASSERT(!GHOST_STATE(hdr->b_state));
3027
3028                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE);
3029                 acb->acb_done = done;
3030                 acb->acb_private = private;
3031
3032                 ASSERT(hdr->b_acb == NULL);
3033                 hdr->b_acb = acb;
3034                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3035
3036                 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
3037                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3038                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3039                         addr = hdr->b_l2hdr->b_daddr;
3040                         /*
3041                          * Lock out device removal.
3042                          */
3043                         if (vdev_is_dead(vd) ||
3044                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3045                                 vd = NULL;
3046                 }
3047
3048                 mutex_exit(hash_lock);
3049
3050                 ASSERT3U(hdr->b_size, ==, size);
3051                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3052                     uint64_t, size, zbookmark_t *, zb);
3053                 ARCSTAT_BUMP(arcstat_misses);
3054                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3055                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3056                     data, metadata, misses);
3057
3058                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3059                         /*
3060                          * Read from the L2ARC if the following are true:
3061                          * 1. The L2ARC vdev was previously cached.
3062                          * 2. This buffer still has L2ARC metadata.
3063                          * 3. This buffer isn't currently writing to the L2ARC.
3064                          * 4. The L2ARC entry wasn't evicted, which may
3065                          *    also have invalidated the vdev.
3066                          * 5. This isn't prefetch and l2arc_noprefetch is set.
3067                          */
3068                         if (hdr->b_l2hdr != NULL &&
3069                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3070                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3071                                 l2arc_read_callback_t *cb;
3072
3073                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3074                                 ARCSTAT_BUMP(arcstat_l2_hits);
3075
3076                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3077                                     KM_PUSHPAGE);
3078                                 cb->l2rcb_buf = buf;
3079                                 cb->l2rcb_spa = spa;
3080                                 cb->l2rcb_bp = *bp;
3081                                 cb->l2rcb_zb = *zb;
3082                                 cb->l2rcb_flags = zio_flags;
3083
3084                                 /*
3085                                  * l2arc read.  The SCL_L2ARC lock will be
3086                                  * released by l2arc_read_done().
3087                                  */
3088                                 rzio = zio_read_phys(pio, vd, addr, size,
3089                                     buf->b_data, ZIO_CHECKSUM_OFF,
3090                                     l2arc_read_done, cb, priority, zio_flags |
3091                                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
3092                                     ZIO_FLAG_DONT_PROPAGATE |
3093                                     ZIO_FLAG_DONT_RETRY, B_FALSE);
3094                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3095                                     zio_t *, rzio);
3096                                 ARCSTAT_INCR(arcstat_l2_read_bytes, size);
3097
3098                                 if (*arc_flags & ARC_NOWAIT) {
3099                                         zio_nowait(rzio);
3100                                         return (0);
3101                                 }
3102
3103                                 ASSERT(*arc_flags & ARC_WAIT);
3104                                 if (zio_wait(rzio) == 0)
3105                                         return (0);
3106
3107                                 /* l2arc read error; goto zio_read() */
3108                         } else {
3109                                 DTRACE_PROBE1(l2arc__miss,
3110                                     arc_buf_hdr_t *, hdr);
3111                                 ARCSTAT_BUMP(arcstat_l2_misses);
3112                                 if (HDR_L2_WRITING(hdr))
3113                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
3114                                 spa_config_exit(spa, SCL_L2ARC, vd);
3115                         }
3116                 } else {
3117                         if (vd != NULL)
3118                                 spa_config_exit(spa, SCL_L2ARC, vd);
3119                         if (l2arc_ndev != 0) {
3120                                 DTRACE_PROBE1(l2arc__miss,
3121                                     arc_buf_hdr_t *, hdr);
3122                                 ARCSTAT_BUMP(arcstat_l2_misses);
3123                         }
3124                 }
3125
3126                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3127                     arc_read_done, buf, priority, zio_flags, zb);
3128
3129                 if (*arc_flags & ARC_WAIT)
3130                         return (zio_wait(rzio));
3131
3132                 ASSERT(*arc_flags & ARC_NOWAIT);
3133                 zio_nowait(rzio);
3134         }
3135         return (0);
3136 }
3137
3138 arc_prune_t *
3139 arc_add_prune_callback(arc_prune_func_t *func, void *private)
3140 {
3141         arc_prune_t *p;
3142
3143         p = kmem_alloc(sizeof(*p), KM_SLEEP);
3144         p->p_pfunc = func;
3145         p->p_private = private;
3146         list_link_init(&p->p_node);
3147         refcount_create(&p->p_refcnt);
3148
3149         mutex_enter(&arc_prune_mtx);
3150         refcount_add(&p->p_refcnt, &arc_prune_list);
3151         list_insert_head(&arc_prune_list, p);
3152         mutex_exit(&arc_prune_mtx);
3153
3154         return (p);
3155 }
3156
3157 void
3158 arc_remove_prune_callback(arc_prune_t *p)
3159 {
3160         mutex_enter(&arc_prune_mtx);
3161         list_remove(&arc_prune_list, p);
3162         if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
3163                 refcount_destroy(&p->p_refcnt);
3164                 kmem_free(p, sizeof (*p));
3165         }
3166         mutex_exit(&arc_prune_mtx);
3167 }
3168
3169 void
3170 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3171 {
3172         ASSERT(buf->b_hdr != NULL);
3173         ASSERT(buf->b_hdr->b_state != arc_anon);
3174         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3175         ASSERT(buf->b_efunc == NULL);
3176         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3177
3178         buf->b_efunc = func;
3179         buf->b_private = private;
3180 }
3181
3182 /*
3183  * This is used by the DMU to let the ARC know that a buffer is
3184  * being evicted, so the ARC should clean up.  If this arc buf
3185  * is not yet in the evicted state, it will be put there.
3186  */
3187 int
3188 arc_buf_evict(arc_buf_t *buf)
3189 {
3190         arc_buf_hdr_t *hdr;
3191         kmutex_t *hash_lock;
3192         arc_buf_t **bufp;
3193
3194         mutex_enter(&buf->b_evict_lock);
3195         hdr = buf->b_hdr;
3196         if (hdr == NULL) {
3197                 /*
3198                  * We are in arc_do_user_evicts().
3199                  */
3200                 ASSERT(buf->b_data == NULL);
3201                 mutex_exit(&buf->b_evict_lock);
3202                 return (0);
3203         } else if (buf->b_data == NULL) {
3204                 arc_buf_t copy = *buf; /* structure assignment */
3205                 /*
3206                  * We are on the eviction list; process this buffer now
3207                  * but let arc_do_user_evicts() do the reaping.
3208                  */
3209                 buf->b_efunc = NULL;
3210                 mutex_exit(&buf->b_evict_lock);
3211                 VERIFY(copy.b_efunc(&copy) == 0);
3212                 return (1);
3213         }
3214         hash_lock = HDR_LOCK(hdr);
3215         mutex_enter(hash_lock);
3216         hdr = buf->b_hdr;
3217         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3218
3219         ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3220         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3221
3222         /*
3223          * Pull this buffer off of the hdr
3224          */
3225         bufp = &hdr->b_buf;
3226         while (*bufp != buf)
3227                 bufp = &(*bufp)->b_next;
3228         *bufp = buf->b_next;
3229
3230         ASSERT(buf->b_data != NULL);
3231         arc_buf_destroy(buf, FALSE, FALSE);
3232
3233         if (hdr->b_datacnt == 0) {
3234                 arc_state_t *old_state = hdr->b_state;
3235                 arc_state_t *evicted_state;
3236
3237                 ASSERT(hdr->b_buf == NULL);
3238                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3239
3240                 evicted_state =
3241                     (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3242
3243                 mutex_enter(&old_state->arcs_mtx);
3244                 mutex_enter(&evicted_state->arcs_mtx);
3245
3246                 arc_change_state(evicted_state, hdr, hash_lock);
3247                 ASSERT(HDR_IN_HASH_TABLE(hdr));
3248                 hdr->b_flags |= ARC_IN_HASH_TABLE;
3249                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3250
3251                 mutex_exit(&evicted_state->arcs_mtx);
3252                 mutex_exit(&old_state->arcs_mtx);
3253         }
3254         mutex_exit(hash_lock);
3255         mutex_exit(&buf->b_evict_lock);
3256
3257         VERIFY(buf->b_efunc(buf) == 0);
3258         buf->b_efunc = NULL;
3259         buf->b_private = NULL;
3260         buf->b_hdr = NULL;
3261         buf->b_next = NULL;
3262         kmem_cache_free(buf_cache, buf);
3263         return (1);
3264 }
3265
3266 /*
3267  * Release this buffer from the cache.  This must be done
3268  * after a read and prior to modifying the buffer contents.
3269  * If the buffer has more than one reference, we must make
3270  * a new hdr for the buffer.
3271  */
3272 void
3273 arc_release(arc_buf_t *buf, void *tag)
3274 {
3275         arc_buf_hdr_t *hdr;
3276         kmutex_t *hash_lock = NULL;
3277         l2arc_buf_hdr_t *l2hdr;
3278         uint64_t buf_size = 0;
3279
3280         /*
3281          * It would be nice to assert that if it's DMU metadata (level >
3282          * 0 || it's the dnode file), then it must be syncing context.
3283          * But we don't know that information at this level.
3284          */
3285
3286         mutex_enter(&buf->b_evict_lock);
3287         hdr = buf->b_hdr;
3288
3289         /* this buffer is not on any list */
3290         ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3291
3292         if (hdr->b_state == arc_anon) {
3293                 /* this buffer is already released */
3294                 ASSERT(buf->b_efunc == NULL);
3295         } else {
3296                 hash_lock = HDR_LOCK(hdr);
3297                 mutex_enter(hash_lock);
3298                 hdr = buf->b_hdr;
3299                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3300         }
3301
3302         l2hdr = hdr->b_l2hdr;
3303         if (l2hdr) {
3304                 mutex_enter(&l2arc_buflist_mtx);
3305                 hdr->b_l2hdr = NULL;
3306                 buf_size = hdr->b_size;
3307         }
3308
3309         /*
3310          * Do we have more than one buf?
3311          */
3312         if (hdr->b_datacnt > 1) {
3313                 arc_buf_hdr_t *nhdr;
3314                 arc_buf_t **bufp;
3315                 uint64_t blksz = hdr->b_size;
3316                 uint64_t spa = hdr->b_spa;
3317                 arc_buf_contents_t type = hdr->b_type;
3318                 uint32_t flags = hdr->b_flags;
3319
3320                 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3321                 /*
3322                  * Pull the data off of this hdr and attach it to
3323                  * a new anonymous hdr.
3324                  */
3325                 (void) remove_reference(hdr, hash_lock, tag);
3326                 bufp = &hdr->b_buf;
3327                 while (*bufp != buf)
3328                         bufp = &(*bufp)->b_next;
3329                 *bufp = buf->b_next;
3330                 buf->b_next = NULL;
3331
3332                 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3333                 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3334                 if (refcount_is_zero(&hdr->b_refcnt)) {
3335                         uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3336                         ASSERT3U(*size, >=, hdr->b_size);
3337                         atomic_add_64(size, -hdr->b_size);
3338                 }
3339                 hdr->b_datacnt -= 1;
3340                 arc_cksum_verify(buf);
3341
3342                 mutex_exit(hash_lock);
3343
3344                 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3345                 nhdr->b_size = blksz;
3346                 nhdr->b_spa = spa;
3347                 nhdr->b_type = type;
3348                 nhdr->b_buf = buf;
3349                 nhdr->b_state = arc_anon;
3350                 nhdr->b_arc_access = 0;
3351                 nhdr->b_flags = flags & ARC_L2_WRITING;
3352                 nhdr->b_l2hdr = NULL;
3353                 nhdr->b_datacnt = 1;
3354                 nhdr->b_freeze_cksum = NULL;
3355                 (void) refcount_add(&nhdr->b_refcnt, tag);
3356                 buf->b_hdr = nhdr;
3357                 mutex_exit(&buf->b_evict_lock);
3358                 atomic_add_64(&arc_anon->arcs_size, blksz);
3359         } else {
3360                 mutex_exit(&buf->b_evict_lock);
3361                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3362                 ASSERT(!list_link_active(&hdr->b_arc_node));
3363                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3364                 if (hdr->b_state != arc_anon)
3365                         arc_change_state(arc_anon, hdr, hash_lock);
3366                 hdr->b_arc_access = 0;
3367                 if (hash_lock)
3368                         mutex_exit(hash_lock);
3369
3370                 buf_discard_identity(hdr);
3371                 arc_buf_thaw(buf);
3372         }
3373         buf->b_efunc = NULL;
3374         buf->b_private = NULL;
3375
3376         if (l2hdr) {
3377                 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3378                 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3379                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3380                 mutex_exit(&l2arc_buflist_mtx);
3381         }
3382 }
3383
3384 /*
3385  * Release this buffer.  If it does not match the provided BP, fill it
3386  * with that block's contents.
3387  */
3388 /* ARGSUSED */
3389 int
3390 arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
3391     zbookmark_t *zb)
3392 {
3393         arc_release(buf, tag);
3394         return (0);
3395 }
3396
3397 int
3398 arc_released(arc_buf_t *buf)
3399 {
3400         int released;
3401
3402         mutex_enter(&buf->b_evict_lock);
3403         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3404         mutex_exit(&buf->b_evict_lock);
3405         return (released);
3406 }
3407
3408 int
3409 arc_has_callback(arc_buf_t *buf)
3410 {
3411         int callback;
3412
3413         mutex_enter(&buf->b_evict_lock);
3414         callback = (buf->b_efunc != NULL);
3415         mutex_exit(&buf->b_evict_lock);
3416         return (callback);
3417 }
3418
3419 #ifdef ZFS_DEBUG
3420 int
3421 arc_referenced(arc_buf_t *buf)
3422 {
3423         int referenced;
3424
3425         mutex_enter(&buf->b_evict_lock);
3426         referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3427         mutex_exit(&buf->b_evict_lock);
3428         return (referenced);
3429 }
3430 #endif
3431
3432 static void
3433 arc_write_ready(zio_t *zio)
3434 {
3435         arc_write_callback_t *callback = zio->io_private;
3436         arc_buf_t *buf = callback->awcb_buf;
3437         arc_buf_hdr_t *hdr = buf->b_hdr;
3438
3439         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3440         callback->awcb_ready(zio, buf, callback->awcb_private);
3441
3442         /*
3443          * If the IO is already in progress, then this is a re-write
3444          * attempt, so we need to thaw and re-compute the cksum.
3445          * It is the responsibility of the callback to handle the
3446          * accounting for any re-write attempt.
3447          */
3448         if (HDR_IO_IN_PROGRESS(hdr)) {
3449                 mutex_enter(&hdr->b_freeze_lock);
3450                 if (hdr->b_freeze_cksum != NULL) {
3451                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3452                         hdr->b_freeze_cksum = NULL;
3453                 }
3454                 mutex_exit(&hdr->b_freeze_lock);
3455         }
3456         arc_cksum_compute(buf, B_FALSE);
3457         hdr->b_flags |= ARC_IO_IN_PROGRESS;
3458 }
3459
3460 static void
3461 arc_write_done(zio_t *zio)
3462 {
3463         arc_write_callback_t *callback = zio->io_private;
3464         arc_buf_t *buf = callback->awcb_buf;
3465         arc_buf_hdr_t *hdr = buf->b_hdr;
3466
3467         ASSERT(hdr->b_acb == NULL);
3468
3469         if (zio->io_error == 0) {
3470                 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3471                 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3472                 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3473         } else {
3474                 ASSERT(BUF_EMPTY(hdr));
3475         }
3476
3477         /*
3478          * If the block to be written was all-zero, we may have
3479          * compressed it away.  In this case no write was performed
3480          * so there will be no dva/birth/checksum.  The buffer must
3481          * therefore remain anonymous (and uncached).
3482          */
3483         if (!BUF_EMPTY(hdr)) {
3484                 arc_buf_hdr_t *exists;
3485                 kmutex_t *hash_lock;
3486
3487                 ASSERT(zio->io_error == 0);
3488
3489                 arc_cksum_verify(buf);
3490
3491                 exists = buf_hash_insert(hdr, &hash_lock);
3492                 if (exists) {
3493                         /*
3494                          * This can only happen if we overwrite for
3495                          * sync-to-convergence, because we remove
3496                          * buffers from the hash table when we arc_free().
3497                          */
3498                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3499                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3500                                         panic("bad overwrite, hdr=%p exists=%p",
3501                                             (void *)hdr, (void *)exists);
3502                                 ASSERT(refcount_is_zero(&exists->b_refcnt));
3503                                 arc_change_state(arc_anon, exists, hash_lock);
3504                                 mutex_exit(hash_lock);
3505                                 arc_hdr_destroy(exists);
3506                                 exists = buf_hash_insert(hdr, &hash_lock);
3507                                 ASSERT3P(exists, ==, NULL);
3508                         } else {
3509                                 /* Dedup */
3510                                 ASSERT(hdr->b_datacnt == 1);
3511                                 ASSERT(hdr->b_state == arc_anon);
3512                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
3513                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3514                         }
3515                 }
3516                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3517                 /* if it's not anon, we are doing a scrub */
3518                 if (!exists && hdr->b_state == arc_anon)
3519                         arc_access(hdr, hash_lock);
3520                 mutex_exit(hash_lock);
3521         } else {
3522                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3523         }
3524
3525         ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3526         callback->awcb_done(zio, buf, callback->awcb_private);
3527
3528         kmem_free(callback, sizeof (arc_write_callback_t));
3529 }
3530
3531 zio_t *
3532 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3533     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
3534     arc_done_func_t *ready, arc_done_func_t *done, void *private,
3535     int priority, int zio_flags, const zbookmark_t *zb)
3536 {
3537         arc_buf_hdr_t *hdr = buf->b_hdr;
3538         arc_write_callback_t *callback;
3539         zio_t *zio;
3540
3541         ASSERT(ready != NULL);
3542         ASSERT(done != NULL);
3543         ASSERT(!HDR_IO_ERROR(hdr));
3544         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3545         ASSERT(hdr->b_acb == NULL);
3546         if (l2arc)
3547                 hdr->b_flags |= ARC_L2CACHE;
3548         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3549         callback->awcb_ready = ready;
3550         callback->awcb_done = done;
3551         callback->awcb_private = private;
3552         callback->awcb_buf = buf;
3553
3554         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3555             arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3556
3557         return (zio);
3558 }
3559
3560 static int
3561 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3562 {
3563 #ifdef _KERNEL
3564         uint64_t available_memory;
3565
3566         /* Easily reclaimable memory (free + inactive + arc-evictable) */
3567         available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();
3568 #if defined(__i386)
3569         available_memory =
3570             MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3571 #endif
3572
3573         if (available_memory <= zfs_write_limit_max) {
3574                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3575                 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
3576                 return (EAGAIN);
3577         }
3578
3579         if (inflight_data > available_memory / 4) {
3580                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3581                 DMU_TX_STAT_BUMP(dmu_tx_memory_inflight);
3582                 return (ERESTART);
3583         }
3584 #endif
3585         return (0);
3586 }
3587
3588 void
3589 arc_tempreserve_clear(uint64_t reserve)
3590 {
3591         atomic_add_64(&arc_tempreserve, -reserve);
3592         ASSERT((int64_t)arc_tempreserve >= 0);
3593 }
3594
3595 int
3596 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3597 {
3598         int error;
3599         uint64_t anon_size;
3600
3601 #ifdef ZFS_DEBUG
3602         /*
3603          * Once in a while, fail for no reason.  Everything should cope.
3604          */
3605         if (spa_get_random(10000) == 0) {
3606                 dprintf("forcing random failure\n");
3607                 return (ERESTART);
3608         }
3609 #endif
3610         if (reserve > arc_c/4 && !arc_no_grow)
3611                 arc_c = MIN(arc_c_max, reserve * 4);
3612         if (reserve > arc_c) {
3613                 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
3614                 return (ENOMEM);
3615         }
3616
3617         /*
3618          * Don't count loaned bufs as in flight dirty data to prevent long
3619          * network delays from blocking transactions that are ready to be
3620          * assigned to a txg.
3621          */
3622         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3623
3624         /*
3625          * Writes will, almost always, require additional memory allocations
3626          * in order to compress/encrypt/etc the data.  We therefor need to
3627          * make sure that there is sufficient available memory for this.
3628          */
3629         if ((error = arc_memory_throttle(reserve, anon_size, txg)))
3630                 return (error);
3631
3632         /*
3633          * Throttle writes when the amount of dirty data in the cache
3634          * gets too large.  We try to keep the cache less than half full
3635          * of dirty blocks so that our sync times don't grow too large.
3636          * Note: if two requests come in concurrently, we might let them
3637          * both succeed, when one of them should fail.  Not a huge deal.
3638          */
3639
3640         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3641             anon_size > arc_c / 4) {
3642                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3643                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3644                     arc_tempreserve>>10,
3645                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3646                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3647                     reserve>>10, arc_c>>10);
3648                 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
3649                 return (ERESTART);
3650         }
3651         atomic_add_64(&arc_tempreserve, reserve);
3652         return (0);
3653 }
3654
3655 static void
3656 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
3657     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
3658 {
3659         size->value.ui64 = state->arcs_size;
3660         evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
3661         evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
3662 }
3663
3664 static int
3665 arc_kstat_update(kstat_t *ksp, int rw)
3666 {
3667         arc_stats_t *as = ksp->ks_data;
3668
3669         if (rw == KSTAT_WRITE) {
3670                 return (EACCES);
3671         } else {
3672                 arc_kstat_update_state(arc_anon,
3673                     &as->arcstat_anon_size,
3674                     &as->arcstat_anon_evict_data,
3675                     &as->arcstat_anon_evict_metadata);
3676                 arc_kstat_update_state(arc_mru,
3677                     &as->arcstat_mru_size,
3678                     &as->arcstat_mru_evict_data,
3679                     &as->arcstat_mru_evict_metadata);
3680                 arc_kstat_update_state(arc_mru_ghost,
3681                     &as->arcstat_mru_ghost_size,
3682                     &as->arcstat_mru_ghost_evict_data,
3683                     &as->arcstat_mru_ghost_evict_metadata);
3684                 arc_kstat_update_state(arc_mfu,
3685                     &as->arcstat_mfu_size,
3686                     &as->arcstat_mfu_evict_data,
3687                     &as->arcstat_mfu_evict_metadata);
3688                 arc_kstat_update_state(arc_mfu_ghost,
3689                     &as->arcstat_mfu_ghost_size,
3690                     &as->arcstat_mfu_ghost_evict_data,
3691                     &as->arcstat_mfu_ghost_evict_metadata);
3692         }
3693
3694         return (0);
3695 }
3696
3697 void
3698 arc_init(void)
3699 {
3700         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3701         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3702
3703         /* Convert seconds to clock ticks */
3704         arc_min_prefetch_lifespan = 1 * hz;
3705
3706         /* Start out with 1/8 of all memory */
3707         arc_c = physmem * PAGESIZE / 8;
3708
3709 #ifdef _KERNEL
3710         /*
3711          * On architectures where the physical memory can be larger
3712          * than the addressable space (intel in 32-bit mode), we may
3713          * need to limit the cache to 1/8 of VM size.
3714          */
3715         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3716         /*
3717          * Register a shrinker to support synchronous (direct) memory
3718          * reclaim from the arc.  This is done to prevent kswapd from
3719          * swapping out pages when it is preferable to shrink the arc.
3720          */
3721         spl_register_shrinker(&arc_shrinker);
3722 #endif
3723
3724         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3725         arc_c_min = MAX(arc_c / 4, 64<<20);
3726         /* set max to 1/2 of all memory */
3727         arc_c_max = MAX(arc_c * 4, arc_c_max);
3728
3729         /*
3730          * Allow the tunables to override our calculations if they are
3731          * reasonable (ie. over 64MB)
3732          */
3733         if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3734                 arc_c_max = zfs_arc_max;
3735         if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3736                 arc_c_min = zfs_arc_min;
3737
3738         arc_c = arc_c_max;
3739         arc_p = (arc_c >> 1);
3740
3741         /* limit meta-data to 1/4 of the arc capacity */
3742         arc_meta_limit = arc_c_max / 4;
3743         arc_meta_max = 0;
3744
3745         /* Allow the tunable to override if it is reasonable */
3746         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3747                 arc_meta_limit = zfs_arc_meta_limit;
3748
3749         if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3750                 arc_c_min = arc_meta_limit / 2;
3751
3752         if (zfs_arc_grow_retry > 0)
3753                 arc_grow_retry = zfs_arc_grow_retry;
3754
3755         if (zfs_arc_shrink_shift > 0)
3756                 arc_shrink_shift = zfs_arc_shrink_shift;
3757
3758         if (zfs_arc_p_min_shift > 0)
3759                 arc_p_min_shift = zfs_arc_p_min_shift;
3760
3761         if (zfs_arc_meta_prune > 0)
3762                 arc_meta_prune = zfs_arc_meta_prune;
3763
3764         /* if kmem_flags are set, lets try to use less memory */
3765         if (kmem_debugging())
3766                 arc_c = arc_c / 2;
3767         if (arc_c < arc_c_min)
3768                 arc_c = arc_c_min;
3769
3770         arc_anon = &ARC_anon;
3771         arc_mru = &ARC_mru;
3772         arc_mru_ghost = &ARC_mru_ghost;
3773         arc_mfu = &ARC_mfu;
3774         arc_mfu_ghost = &ARC_mfu_ghost;
3775         arc_l2c_only = &ARC_l2c_only;
3776         arc_size = 0;
3777
3778         mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3779         mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3780         mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3781         mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3782         mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3783         mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3784
3785         list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3786             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3787         list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3788             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3789         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3790             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3791         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3792             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3793         list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3794             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3795         list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3796             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3797         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3798             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3799         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3800             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3801         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3802             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3803         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3804             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3805
3806         buf_init();
3807
3808         arc_thread_exit = 0;
3809         list_create(&arc_prune_list, sizeof (arc_prune_t),
3810             offsetof(arc_prune_t, p_node));
3811         arc_eviction_list = NULL;
3812         mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
3813         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3814         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3815
3816         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3817             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3818
3819         if (arc_ksp != NULL) {
3820                 arc_ksp->ks_data = &arc_stats;
3821                 arc_ksp->ks_update = arc_kstat_update;
3822                 kstat_install(arc_ksp);
3823         }
3824
3825         (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
3826             TS_RUN, minclsyspri);
3827
3828         arc_dead = FALSE;
3829         arc_warm = B_FALSE;
3830
3831         if (zfs_write_limit_max == 0)
3832                 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3833         else
3834                 zfs_write_limit_shift = 0;
3835         mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3836 }
3837
3838 void
3839 arc_fini(void)
3840 {
3841         arc_prune_t *p;
3842
3843         mutex_enter(&arc_reclaim_thr_lock);
3844 #ifdef _KERNEL
3845         spl_unregister_shrinker(&arc_shrinker);
3846 #endif /* _KERNEL */
3847
3848         arc_thread_exit = 1;
3849         while (arc_thread_exit != 0)
3850                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3851         mutex_exit(&arc_reclaim_thr_lock);
3852
3853         arc_flush(NULL);
3854
3855         arc_dead = TRUE;
3856
3857         if (arc_ksp != NULL) {
3858                 kstat_delete(arc_ksp);
3859                 arc_ksp = NULL;
3860         }
3861
3862         mutex_enter(&arc_prune_mtx);
3863         while ((p = list_head(&arc_prune_list)) != NULL) {
3864                 list_remove(&arc_prune_list, p);
3865                 refcount_remove(&p->p_refcnt, &arc_prune_list);
3866                 refcount_destroy(&p->p_refcnt);
3867                 kmem_free(p, sizeof (*p));
3868         }
3869         mutex_exit(&arc_prune_mtx);
3870
3871         list_destroy(&arc_prune_list);
3872         mutex_destroy(&arc_prune_mtx);
3873         mutex_destroy(&arc_eviction_mtx);
3874         mutex_destroy(&arc_reclaim_thr_lock);
3875         cv_destroy(&arc_reclaim_thr_cv);
3876
3877         list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3878         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3879         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3880         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3881         list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3882         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3883         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3884         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3885
3886         mutex_destroy(&arc_anon->arcs_mtx);
3887         mutex_destroy(&arc_mru->arcs_mtx);
3888         mutex_destroy(&arc_mru_ghost->arcs_mtx);
3889         mutex_destroy(&arc_mfu->arcs_mtx);
3890         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3891         mutex_destroy(&arc_l2c_only->arcs_mtx);
3892
3893         mutex_destroy(&zfs_write_limit_lock);
3894
3895         buf_fini();
3896
3897         ASSERT(arc_loaned_bytes == 0);
3898 }
3899
3900 /*
3901  * Level 2 ARC
3902  *
3903  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3904  * It uses dedicated storage devices to hold cached data, which are populated
3905  * using large infrequent writes.  The main role of this cache is to boost
3906  * the performance of random read workloads.  The intended L2ARC devices
3907  * include short-stroked disks, solid state disks, and other media with
3908  * substantially faster read latency than disk.
3909  *
3910  *                 +-----------------------+
3911  *                 |         ARC           |
3912  *                 +-----------------------+
3913  *                    |         ^     ^
3914  *                    |         |     |
3915  *      l2arc_feed_thread()    arc_read()
3916  *                    |         |     |
3917  *                    |  l2arc read   |
3918  *                    V         |     |
3919  *               +---------------+    |
3920  *               |     L2ARC     |    |
3921  *               +---------------+    |
3922  *                   |    ^           |
3923  *          l2arc_write() |           |
3924  *                   |    |           |
3925  *                   V    |           |
3926  *                 +-------+      +-------+
3927  *                 | vdev  |      | vdev  |
3928  *                 | cache |      | cache |
3929  *                 +-------+      +-------+
3930  *                 +=========+     .-----.
3931  *                 :  L2ARC  :    |-_____-|
3932  *                 : devices :    | Disks |
3933  *                 +=========+    `-_____-'
3934  *
3935  * Read requests are satisfied from the following sources, in order:
3936  *
3937  *      1) ARC
3938  *      2) vdev cache of L2ARC devices
3939  *      3) L2ARC devices
3940  *      4) vdev cache of disks
3941  *      5) disks
3942  *
3943  * Some L2ARC device types exhibit extremely slow write performance.
3944  * To accommodate for this there are some significant differences between
3945  * the L2ARC and traditional cache design:
3946  *
3947  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
3948  * the ARC behave as usual, freeing buffers and placing headers on ghost
3949  * lists.  The ARC does not send buffers to the L2ARC during eviction as
3950  * this would add inflated write latencies for all ARC memory pressure.
3951  *
3952  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3953  * It does this by periodically scanning buffers from the eviction-end of
3954  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3955  * not already there.  It scans until a headroom of buffers is satisfied,
3956  * which itself is a buffer for ARC eviction.  The thread that does this is
3957  * l2arc_feed_thread(), illustrated below; example sizes are included to
3958  * provide a better sense of ratio than this diagram:
3959  *
3960  *             head -->                        tail
3961  *              +---------------------+----------+
3962  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
3963  *              +---------------------+----------+   |   o L2ARC eligible
3964  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
3965  *              +---------------------+----------+   |
3966  *                   15.9 Gbytes      ^ 32 Mbytes    |
3967  *                                 headroom          |
3968  *                                            l2arc_feed_thread()
3969  *                                                   |
3970  *                       l2arc write hand <--[oooo]--'
3971  *                               |           8 Mbyte
3972  *                               |          write max
3973  *                               V
3974  *                +==============================+
3975  *      L2ARC dev |####|#|###|###|    |####| ... |
3976  *                +==============================+
3977  *                           32 Gbytes
3978  *
3979  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3980  * evicted, then the L2ARC has cached a buffer much sooner than it probably
3981  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
3982  * safe to say that this is an uncommon case, since buffers at the end of
3983  * the ARC lists have moved there due to inactivity.
3984  *
3985  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3986  * then the L2ARC simply misses copying some buffers.  This serves as a
3987  * pressure valve to prevent heavy read workloads from both stalling the ARC
3988  * with waits and clogging the L2ARC with writes.  This also helps prevent
3989  * the potential for the L2ARC to churn if it attempts to cache content too
3990  * quickly, such as during backups of the entire pool.
3991  *
3992  * 5. After system boot and before the ARC has filled main memory, there are
3993  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3994  * lists can remain mostly static.  Instead of searching from tail of these
3995  * lists as pictured, the l2arc_feed_thread() will search from the list heads
3996  * for eligible buffers, greatly increasing its chance of finding them.
3997  *
3998  * The L2ARC device write speed is also boosted during this time so that
3999  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4000  * there are no L2ARC reads, and no fear of degrading read performance
4001  * through increased writes.
4002  *
4003  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4004  * the vdev queue can aggregate them into larger and fewer writes.  Each
4005  * device is written to in a rotor fashion, sweeping writes through
4006  * available space then repeating.
4007  *
4008  * 7. The L2ARC does not store dirty content.  It never needs to flush
4009  * write buffers back to disk based storage.
4010  *
4011  * 8. If an ARC buffer is written (and dirtied) which also exists in the
4012  * L2ARC, the now stale L2ARC buffer is immediately dropped.
4013  *
4014  * The performance of the L2ARC can be tweaked by a number of tunables, which
4015  * may be necessary for different workloads:
4016  *
4017  *      l2arc_write_max         max write bytes per interval
4018  *      l2arc_write_boost       extra write bytes during device warmup
4019  *      l2arc_noprefetch        skip caching prefetched buffers
4020  *      l2arc_headroom          number of max device writes to precache
4021  *      l2arc_feed_secs         seconds between L2ARC writing
4022  *
4023  * Tunables may be removed or added as future performance improvements are
4024  * integrated, and also may become zpool properties.
4025  *
4026  * There are three key functions that control how the L2ARC warms up:
4027  *
4028  *      l2arc_write_eligible()  check if a buffer is eligible to cache
4029  *      l2arc_write_size()      calculate how much to write
4030  *      l2arc_write_interval()  calculate sleep delay between writes
4031  *
4032  * These three functions determine what to write, how much, and how quickly
4033  * to send writes.
4034  */
4035
4036 static boolean_t
4037 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4038 {
4039         /*
4040          * A buffer is *not* eligible for the L2ARC if it:
4041          * 1. belongs to a different spa.
4042          * 2. is already cached on the L2ARC.
4043          * 3. has an I/O in progress (it may be an incomplete read).
4044          * 4. is flagged not eligible (zfs property).
4045          */
4046         if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4047             HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4048                 return (B_FALSE);
4049
4050         return (B_TRUE);
4051 }
4052
4053 static uint64_t
4054 l2arc_write_size(l2arc_dev_t *dev)
4055 {
4056         uint64_t size;
4057
4058         size = dev->l2ad_write;
4059
4060         if (arc_warm == B_FALSE)
4061                 size += dev->l2ad_boost;
4062
4063         return (size);
4064
4065 }
4066
4067 static clock_t
4068 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4069 {
4070         clock_t interval, next, now;
4071
4072         /*
4073          * If the ARC lists are busy, increase our write rate; if the
4074          * lists are stale, idle back.  This is achieved by checking
4075          * how much we previously wrote - if it was more than half of
4076          * what we wanted, schedule the next write much sooner.
4077          */
4078         if (l2arc_feed_again && wrote > (wanted / 2))
4079                 interval = (hz * l2arc_feed_min_ms) / 1000;
4080         else
4081                 interval = hz * l2arc_feed_secs;
4082
4083         now = ddi_get_lbolt();
4084         next = MAX(now, MIN(now + interval, began + interval));
4085
4086         return (next);
4087 }
4088
4089 static void
4090 l2arc_hdr_stat_add(void)
4091 {
4092         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4093         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4094 }
4095
4096 static void
4097 l2arc_hdr_stat_remove(void)
4098 {
4099         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4100         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4101 }
4102
4103 /*
4104  * Cycle through L2ARC devices.  This is how L2ARC load balances.
4105  * If a device is returned, this also returns holding the spa config lock.
4106  */
4107 static l2arc_dev_t *
4108 l2arc_dev_get_next(void)
4109 {
4110         l2arc_dev_t *first, *next = NULL;
4111
4112         /*
4113          * Lock out the removal of spas (spa_namespace_lock), then removal
4114          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4115          * both locks will be dropped and a spa config lock held instead.
4116          */
4117         mutex_enter(&spa_namespace_lock);
4118         mutex_enter(&l2arc_dev_mtx);
4119
4120         /* if there are no vdevs, there is nothing to do */
4121         if (l2arc_ndev == 0)
4122                 goto out;
4123
4124         first = NULL;
4125         next = l2arc_dev_last;
4126         do {
4127                 /* loop around the list looking for a non-faulted vdev */
4128                 if (next == NULL) {
4129                         next = list_head(l2arc_dev_list);
4130                 } else {
4131                         next = list_next(l2arc_dev_list, next);
4132                         if (next == NULL)
4133                                 next = list_head(l2arc_dev_list);
4134                 }
4135
4136                 /* if we have come back to the start, bail out */
4137                 if (first == NULL)
4138                         first = next;
4139                 else if (next == first)
4140                         break;
4141
4142         } while (vdev_is_dead(next->l2ad_vdev));
4143
4144         /* if we were unable to find any usable vdevs, return NULL */
4145         if (vdev_is_dead(next->l2ad_vdev))
4146                 next = NULL;
4147
4148         l2arc_dev_last = next;
4149
4150 out:
4151         mutex_exit(&l2arc_dev_mtx);
4152
4153         /*
4154          * Grab the config lock to prevent the 'next' device from being
4155          * removed while we are writing to it.
4156          */
4157         if (next != NULL)
4158                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4159         mutex_exit(&spa_namespace_lock);
4160
4161         return (next);
4162 }
4163
4164 /*
4165  * Free buffers that were tagged for destruction.
4166  */
4167 static void
4168 l2arc_do_free_on_write(void)
4169 {
4170         list_t *buflist;
4171         l2arc_data_free_t *df, *df_prev;
4172
4173         mutex_enter(&l2arc_free_on_write_mtx);
4174         buflist = l2arc_free_on_write;
4175
4176         for (df = list_tail(buflist); df; df = df_prev) {
4177                 df_prev = list_prev(buflist, df);
4178                 ASSERT(df->l2df_data != NULL);
4179                 ASSERT(df->l2df_func != NULL);
4180                 df->l2df_func(df->l2df_data, df->l2df_size);
4181                 list_remove(buflist, df);
4182                 kmem_free(df, sizeof (l2arc_data_free_t));
4183         }
4184
4185         mutex_exit(&l2arc_free_on_write_mtx);
4186 }
4187
4188 /*
4189  * A write to a cache device has completed.  Update all headers to allow
4190  * reads from these buffers to begin.
4191  */
4192 static void
4193 l2arc_write_done(zio_t *zio)
4194 {
4195         l2arc_write_callback_t *cb;
4196         l2arc_dev_t *dev;
4197         list_t *buflist;
4198         arc_buf_hdr_t *head, *ab, *ab_prev;
4199         l2arc_buf_hdr_t *abl2;
4200         kmutex_t *hash_lock;
4201
4202         cb = zio->io_private;
4203         ASSERT(cb != NULL);
4204         dev = cb->l2wcb_dev;
4205         ASSERT(dev != NULL);
4206         head = cb->l2wcb_head;
4207         ASSERT(head != NULL);
4208         buflist = dev->l2ad_buflist;
4209         ASSERT(buflist != NULL);
4210         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4211             l2arc_write_callback_t *, cb);
4212
4213         if (zio->io_error != 0)
4214                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4215
4216         mutex_enter(&l2arc_buflist_mtx);
4217
4218         /*
4219          * All writes completed, or an error was hit.
4220          */
4221         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4222                 ab_prev = list_prev(buflist, ab);
4223
4224                 hash_lock = HDR_LOCK(ab);
4225                 if (!mutex_tryenter(hash_lock)) {
4226                         /*
4227                          * This buffer misses out.  It may be in a stage
4228                          * of eviction.  Its ARC_L2_WRITING flag will be
4229                          * left set, denying reads to this buffer.
4230                          */
4231                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4232                         continue;
4233                 }
4234
4235                 if (zio->io_error != 0) {
4236                         /*
4237                          * Error - drop L2ARC entry.
4238                          */
4239                         list_remove(buflist, ab);
4240                         abl2 = ab->b_l2hdr;
4241                         ab->b_l2hdr = NULL;
4242                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4243                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4244                 }
4245
4246                 /*
4247                  * Allow ARC to begin reads to this L2ARC entry.
4248                  */
4249                 ab->b_flags &= ~ARC_L2_WRITING;
4250
4251                 mutex_exit(hash_lock);
4252         }
4253
4254         atomic_inc_64(&l2arc_writes_done);
4255         list_remove(buflist, head);
4256         kmem_cache_free(hdr_cache, head);
4257         mutex_exit(&l2arc_buflist_mtx);
4258
4259         l2arc_do_free_on_write();
4260
4261         kmem_free(cb, sizeof (l2arc_write_callback_t));
4262 }
4263
4264 /*
4265  * A read to a cache device completed.  Validate buffer contents before
4266  * handing over to the regular ARC routines.
4267  */
4268 static void
4269 l2arc_read_done(zio_t *zio)
4270 {
4271         l2arc_read_callback_t *cb;
4272         arc_buf_hdr_t *hdr;
4273         arc_buf_t *buf;
4274         kmutex_t *hash_lock;
4275         int equal;
4276
4277         ASSERT(zio->io_vd != NULL);
4278         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4279
4280         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4281
4282         cb = zio->io_private;
4283         ASSERT(cb != NULL);
4284         buf = cb->l2rcb_buf;
4285         ASSERT(buf != NULL);
4286
4287         hash_lock = HDR_LOCK(buf->b_hdr);
4288         mutex_enter(hash_lock);
4289         hdr = buf->b_hdr;
4290         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4291
4292         /*
4293          * Check this survived the L2ARC journey.
4294          */
4295         equal = arc_cksum_equal(buf);
4296         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4297                 mutex_exit(hash_lock);
4298                 zio->io_private = buf;
4299                 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4300                 zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */
4301                 arc_read_done(zio);
4302         } else {
4303                 mutex_exit(hash_lock);
4304                 /*
4305                  * Buffer didn't survive caching.  Increment stats and
4306                  * reissue to the original storage device.
4307                  */
4308                 if (zio->io_error != 0) {
4309                         ARCSTAT_BUMP(arcstat_l2_io_error);
4310                 } else {
4311                         zio->io_error = EIO;
4312                 }
4313                 if (!equal)
4314                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4315
4316                 /*
4317                  * If there's no waiter, issue an async i/o to the primary
4318                  * storage now.  If there *is* a waiter, the caller must
4319                  * issue the i/o in a context where it's OK to block.
4320                  */
4321                 if (zio->io_waiter == NULL) {
4322                         zio_t *pio = zio_unique_parent(zio);
4323
4324                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4325
4326                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4327                             buf->b_data, zio->io_size, arc_read_done, buf,
4328                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4329                 }
4330         }
4331
4332         kmem_free(cb, sizeof (l2arc_read_callback_t));
4333 }
4334
4335 /*
4336  * This is the list priority from which the L2ARC will search for pages to
4337  * cache.  This is used within loops (0..3) to cycle through lists in the
4338  * desired order.  This order can have a significant effect on cache
4339  * performance.
4340  *
4341  * Currently the metadata lists are hit first, MFU then MRU, followed by
4342  * the data lists.  This function returns a locked list, and also returns
4343  * the lock pointer.
4344  */
4345 static list_t *
4346 l2arc_list_locked(int list_num, kmutex_t **lock)
4347 {
4348         list_t *list = NULL;
4349
4350         ASSERT(list_num >= 0 && list_num <= 3);
4351
4352         switch (list_num) {
4353         case 0:
4354                 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4355                 *lock = &arc_mfu->arcs_mtx;
4356                 break;
4357         case 1:
4358                 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4359                 *lock = &arc_mru->arcs_mtx;
4360                 break;
4361         case 2:
4362                 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4363                 *lock = &arc_mfu->arcs_mtx;
4364                 break;
4365         case 3:
4366                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4367                 *lock = &arc_mru->arcs_mtx;
4368                 break;
4369         }
4370
4371         ASSERT(!(MUTEX_HELD(*lock)));
4372         mutex_enter(*lock);
4373         return (list);
4374 }
4375
4376 /*
4377  * Evict buffers from the device write hand to the distance specified in
4378  * bytes.  This distance may span populated buffers, it may span nothing.
4379  * This is clearing a region on the L2ARC device ready for writing.
4380  * If the 'all' boolean is set, every buffer is evicted.
4381  */
4382 static void
4383 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4384 {
4385         list_t *buflist;
4386         l2arc_buf_hdr_t *abl2;
4387         arc_buf_hdr_t *ab, *ab_prev;
4388         kmutex_t *hash_lock;
4389         uint64_t taddr;
4390
4391         buflist = dev->l2ad_buflist;
4392
4393         if (buflist == NULL)
4394                 return;
4395
4396         if (!all && dev->l2ad_first) {
4397                 /*
4398                  * This is the first sweep through the device.  There is
4399                  * nothing to evict.
4400                  */
4401                 return;
4402         }
4403
4404         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4405                 /*
4406                  * When nearing the end of the device, evict to the end
4407                  * before the device write hand jumps to the start.
4408                  */
4409                 taddr = dev->l2ad_end;
4410         } else {
4411                 taddr = dev->l2ad_hand + distance;
4412         }
4413         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4414             uint64_t, taddr, boolean_t, all);
4415
4416 top:
4417         mutex_enter(&l2arc_buflist_mtx);
4418         for (ab = list_tail(buflist); ab; ab = ab_prev) {
4419                 ab_prev = list_prev(buflist, ab);
4420
4421                 hash_lock = HDR_LOCK(ab);
4422                 if (!mutex_tryenter(hash_lock)) {
4423                         /*
4424                          * Missed the hash lock.  Retry.
4425                          */
4426                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4427                         mutex_exit(&l2arc_buflist_mtx);
4428                         mutex_enter(hash_lock);
4429                         mutex_exit(hash_lock);
4430                         goto top;
4431                 }
4432
4433                 if (HDR_L2_WRITE_HEAD(ab)) {
4434                         /*
4435                          * We hit a write head node.  Leave it for
4436                          * l2arc_write_done().
4437                          */
4438                         list_remove(buflist, ab);
4439                         mutex_exit(hash_lock);
4440                         continue;
4441                 }
4442
4443                 if (!all && ab->b_l2hdr != NULL &&
4444                     (ab->b_l2hdr->b_daddr > taddr ||
4445                     ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4446                         /*
4447                          * We've evicted to the target address,
4448                          * or the end of the device.
4449                          */
4450                         mutex_exit(hash_lock);
4451                         break;
4452                 }
4453
4454                 if (HDR_FREE_IN_PROGRESS(ab)) {
4455                         /*
4456                          * Already on the path to destruction.
4457                          */
4458                         mutex_exit(hash_lock);
4459                         continue;
4460                 }
4461
4462                 if (ab->b_state == arc_l2c_only) {
4463                         ASSERT(!HDR_L2_READING(ab));
4464                         /*
4465                          * This doesn't exist in the ARC.  Destroy.
4466                          * arc_hdr_destroy() will call list_remove()
4467                          * and decrement arcstat_l2_size.
4468                          */
4469                         arc_change_state(arc_anon, ab, hash_lock);
4470                         arc_hdr_destroy(ab);
4471                 } else {
4472                         /*
4473                          * Invalidate issued or about to be issued
4474                          * reads, since we may be about to write
4475                          * over this location.
4476                          */
4477                         if (HDR_L2_READING(ab)) {
4478                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4479                                 ab->b_flags |= ARC_L2_EVICTED;
4480                         }
4481
4482                         /*
4483                          * Tell ARC this no longer exists in L2ARC.
4484                          */
4485                         if (ab->b_l2hdr != NULL) {
4486                                 abl2 = ab->b_l2hdr;
4487                                 ab->b_l2hdr = NULL;
4488                                 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4489                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4490                         }
4491                         list_remove(buflist, ab);
4492
4493                         /*
4494                          * This may have been leftover after a
4495                          * failed write.
4496                          */
4497                         ab->b_flags &= ~ARC_L2_WRITING;
4498                 }
4499                 mutex_exit(hash_lock);
4500         }
4501         mutex_exit(&l2arc_buflist_mtx);
4502
4503         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4504         dev->l2ad_evict = taddr;
4505 }
4506
4507 /*
4508  * Find and write ARC buffers to the L2ARC device.
4509  *
4510  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4511  * for reading until they have completed writing.
4512  */
4513 static uint64_t
4514 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4515 {
4516         arc_buf_hdr_t *ab, *ab_prev, *head;
4517         l2arc_buf_hdr_t *hdrl2;
4518         list_t *list;
4519         uint64_t passed_sz, write_sz, buf_sz, headroom;
4520         void *buf_data;
4521         kmutex_t *hash_lock, *list_lock = NULL;
4522         boolean_t have_lock, full;
4523         l2arc_write_callback_t *cb;
4524         zio_t *pio, *wzio;
4525         uint64_t guid = spa_guid(spa);
4526         int try;
4527
4528         ASSERT(dev->l2ad_vdev != NULL);
4529
4530         pio = NULL;
4531         write_sz = 0;
4532         full = B_FALSE;
4533         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4534         head->b_flags |= ARC_L2_WRITE_HEAD;
4535
4536         /*
4537          * Copy buffers for L2ARC writing.
4538          */
4539         mutex_enter(&l2arc_buflist_mtx);
4540         for (try = 0; try <= 3; try++) {
4541                 list = l2arc_list_locked(try, &list_lock);
4542                 passed_sz = 0;
4543
4544                 /*
4545                  * L2ARC fast warmup.
4546                  *
4547                  * Until the ARC is warm and starts to evict, read from the
4548                  * head of the ARC lists rather than the tail.
4549                  */
4550                 headroom = target_sz * l2arc_headroom;
4551                 if (arc_warm == B_FALSE)
4552                         ab = list_head(list);
4553                 else
4554                         ab = list_tail(list);
4555
4556                 for (; ab; ab = ab_prev) {
4557                         if (arc_warm == B_FALSE)
4558                                 ab_prev = list_next(list, ab);
4559                         else
4560                                 ab_prev = list_prev(list, ab);
4561
4562                         hash_lock = HDR_LOCK(ab);
4563                         have_lock = MUTEX_HELD(hash_lock);
4564                         if (!have_lock && !mutex_tryenter(hash_lock)) {
4565                                 /*
4566                                  * Skip this buffer rather than waiting.
4567                                  */
4568                                 continue;
4569                         }
4570
4571                         passed_sz += ab->b_size;
4572                         if (passed_sz > headroom) {
4573                                 /*
4574                                  * Searched too far.
4575                                  */
4576                                 mutex_exit(hash_lock);
4577                                 break;
4578                         }
4579
4580                         if (!l2arc_write_eligible(guid, ab)) {
4581                                 mutex_exit(hash_lock);
4582                                 continue;
4583                         }
4584
4585                         if ((write_sz + ab->b_size) > target_sz) {
4586                                 full = B_TRUE;
4587                                 mutex_exit(hash_lock);
4588                                 break;
4589                         }
4590
4591                         if (pio == NULL) {
4592                                 /*
4593                                  * Insert a dummy header on the buflist so
4594                                  * l2arc_write_done() can find where the
4595                                  * write buffers begin without searching.
4596                                  */
4597                                 list_insert_head(dev->l2ad_buflist, head);
4598
4599                                 cb = kmem_alloc(sizeof (l2arc_write_callback_t),
4600                                                 KM_PUSHPAGE);
4601                                 cb->l2wcb_dev = dev;
4602                                 cb->l2wcb_head = head;
4603                                 pio = zio_root(spa, l2arc_write_done, cb,
4604                                     ZIO_FLAG_CANFAIL);
4605                         }
4606
4607                         /*
4608                          * Create and add a new L2ARC header.
4609                          */
4610                         hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
4611                                             KM_PUSHPAGE);
4612                         hdrl2->b_dev = dev;
4613                         hdrl2->b_daddr = dev->l2ad_hand;
4614
4615                         ab->b_flags |= ARC_L2_WRITING;
4616                         ab->b_l2hdr = hdrl2;
4617                         list_insert_head(dev->l2ad_buflist, ab);
4618                         buf_data = ab->b_buf->b_data;
4619                         buf_sz = ab->b_size;
4620
4621                         /*
4622                          * Compute and store the buffer cksum before
4623                          * writing.  On debug the cksum is verified first.
4624                          */
4625                         arc_cksum_verify(ab->b_buf);
4626                         arc_cksum_compute(ab->b_buf, B_TRUE);
4627
4628                         mutex_exit(hash_lock);
4629
4630                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
4631                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4632                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4633                             ZIO_FLAG_CANFAIL, B_FALSE);
4634
4635                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4636                             zio_t *, wzio);
4637                         (void) zio_nowait(wzio);
4638
4639                         /*
4640                          * Keep the clock hand suitably device-aligned.
4641                          */
4642                         buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4643
4644                         write_sz += buf_sz;
4645                         dev->l2ad_hand += buf_sz;
4646                 }
4647
4648                 mutex_exit(list_lock);
4649
4650                 if (full == B_TRUE)
4651                         break;
4652         }
4653         mutex_exit(&l2arc_buflist_mtx);
4654
4655         if (pio == NULL) {
4656                 ASSERT3U(write_sz, ==, 0);
4657                 kmem_cache_free(hdr_cache, head);
4658                 return (0);
4659         }
4660
4661         ASSERT3U(write_sz, <=, target_sz);
4662         ARCSTAT_BUMP(arcstat_l2_writes_sent);
4663         ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
4664         ARCSTAT_INCR(arcstat_l2_size, write_sz);
4665         vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
4666
4667         /*
4668          * Bump device hand to the device start if it is approaching the end.
4669          * l2arc_evict() will already have evicted ahead for this case.
4670          */
4671         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4672                 vdev_space_update(dev->l2ad_vdev,
4673                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
4674                 dev->l2ad_hand = dev->l2ad_start;
4675                 dev->l2ad_evict = dev->l2ad_start;
4676                 dev->l2ad_first = B_FALSE;
4677         }
4678
4679         dev->l2ad_writing = B_TRUE;
4680         (void) zio_wait(pio);
4681         dev->l2ad_writing = B_FALSE;
4682
4683         return (write_sz);
4684 }
4685
4686 /*
4687  * This thread feeds the L2ARC at regular intervals.  This is the beating
4688  * heart of the L2ARC.
4689  */
4690 static void
4691 l2arc_feed_thread(void)
4692 {
4693         callb_cpr_t cpr;
4694         l2arc_dev_t *dev;
4695         spa_t *spa;
4696         uint64_t size, wrote;
4697         clock_t begin, next = ddi_get_lbolt();
4698
4699         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4700
4701         mutex_enter(&l2arc_feed_thr_lock);
4702
4703         while (l2arc_thread_exit == 0) {
4704                 CALLB_CPR_SAFE_BEGIN(&cpr);
4705                 (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
4706                     &l2arc_feed_thr_lock, next);
4707                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4708                 next = ddi_get_lbolt() + hz;
4709
4710                 /*
4711                  * Quick check for L2ARC devices.
4712                  */
4713                 mutex_enter(&l2arc_dev_mtx);
4714                 if (l2arc_ndev == 0) {
4715                         mutex_exit(&l2arc_dev_mtx);
4716                         continue;
4717                 }
4718                 mutex_exit(&l2arc_dev_mtx);
4719                 begin = ddi_get_lbolt();
4720
4721                 /*
4722                  * This selects the next l2arc device to write to, and in
4723                  * doing so the next spa to feed from: dev->l2ad_spa.   This
4724                  * will return NULL if there are now no l2arc devices or if
4725                  * they are all faulted.
4726                  *
4727                  * If a device is returned, its spa's config lock is also
4728                  * held to prevent device removal.  l2arc_dev_get_next()
4729                  * will grab and release l2arc_dev_mtx.
4730                  */
4731                 if ((dev = l2arc_dev_get_next()) == NULL)
4732                         continue;
4733
4734                 spa = dev->l2ad_spa;
4735                 ASSERT(spa != NULL);
4736
4737                 /*
4738                  * If the pool is read-only then force the feed thread to
4739                  * sleep a little longer.
4740                  */
4741                 if (!spa_writeable(spa)) {
4742                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4743                         spa_config_exit(spa, SCL_L2ARC, dev);
4744                         continue;
4745                 }
4746
4747                 /*
4748                  * Avoid contributing to memory pressure.
4749                  */
4750                 if (arc_no_grow) {
4751                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4752                         spa_config_exit(spa, SCL_L2ARC, dev);
4753                         continue;
4754                 }
4755
4756                 ARCSTAT_BUMP(arcstat_l2_feeds);
4757
4758                 size = l2arc_write_size(dev);
4759
4760                 /*
4761                  * Evict L2ARC buffers that will be overwritten.
4762                  */
4763                 l2arc_evict(dev, size, B_FALSE);
4764
4765                 /*
4766                  * Write ARC buffers.
4767                  */
4768                 wrote = l2arc_write_buffers(spa, dev, size);
4769
4770                 /*
4771                  * Calculate interval between writes.
4772                  */
4773                 next = l2arc_write_interval(begin, size, wrote);
4774                 spa_config_exit(spa, SCL_L2ARC, dev);
4775         }
4776
4777         l2arc_thread_exit = 0;
4778         cv_broadcast(&l2arc_feed_thr_cv);
4779         CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
4780         thread_exit();
4781 }
4782
4783 boolean_t
4784 l2arc_vdev_present(vdev_t *vd)
4785 {
4786         l2arc_dev_t *dev;
4787
4788         mutex_enter(&l2arc_dev_mtx);
4789         for (dev = list_head(l2arc_dev_list); dev != NULL;
4790             dev = list_next(l2arc_dev_list, dev)) {
4791                 if (dev->l2ad_vdev == vd)
4792                         break;
4793         }
4794         mutex_exit(&l2arc_dev_mtx);
4795
4796         return (dev != NULL);
4797 }
4798
4799 /*
4800  * Add a vdev for use by the L2ARC.  By this point the spa has already
4801  * validated the vdev and opened it.
4802  */
4803 void
4804 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
4805 {
4806         l2arc_dev_t *adddev;
4807
4808         ASSERT(!l2arc_vdev_present(vd));
4809
4810         /*
4811          * Create a new l2arc device entry.
4812          */
4813         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4814         adddev->l2ad_spa = spa;
4815         adddev->l2ad_vdev = vd;
4816         adddev->l2ad_write = l2arc_write_max;
4817         adddev->l2ad_boost = l2arc_write_boost;
4818         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
4819         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
4820         adddev->l2ad_hand = adddev->l2ad_start;
4821         adddev->l2ad_evict = adddev->l2ad_start;
4822         adddev->l2ad_first = B_TRUE;
4823         adddev->l2ad_writing = B_FALSE;
4824         list_link_init(&adddev->l2ad_node);
4825         ASSERT3U(adddev->l2ad_write, >, 0);
4826
4827         /*
4828          * This is a list of all ARC buffers that are still valid on the
4829          * device.
4830          */
4831         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4832         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4833             offsetof(arc_buf_hdr_t, b_l2node));
4834
4835         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
4836
4837         /*
4838          * Add device to global list
4839          */
4840         mutex_enter(&l2arc_dev_mtx);
4841         list_insert_head(l2arc_dev_list, adddev);
4842         atomic_inc_64(&l2arc_ndev);
4843         mutex_exit(&l2arc_dev_mtx);
4844 }
4845
4846 /*
4847  * Remove a vdev from the L2ARC.
4848  */
4849 void
4850 l2arc_remove_vdev(vdev_t *vd)
4851 {
4852         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4853
4854         /*
4855          * Find the device by vdev
4856          */
4857         mutex_enter(&l2arc_dev_mtx);
4858         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4859                 nextdev = list_next(l2arc_dev_list, dev);
4860                 if (vd == dev->l2ad_vdev) {
4861                         remdev = dev;
4862                         break;
4863                 }
4864         }
4865         ASSERT(remdev != NULL);
4866
4867         /*
4868          * Remove device from global list
4869          */
4870         list_remove(l2arc_dev_list, remdev);
4871         l2arc_dev_last = NULL;          /* may have been invalidated */
4872         atomic_dec_64(&l2arc_ndev);
4873         mutex_exit(&l2arc_dev_mtx);
4874
4875         /*
4876          * Clear all buflists and ARC references.  L2ARC device flush.
4877          */
4878         l2arc_evict(remdev, 0, B_TRUE);
4879         list_destroy(remdev->l2ad_buflist);
4880         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4881         kmem_free(remdev, sizeof (l2arc_dev_t));
4882 }
4883
4884 void
4885 l2arc_init(void)
4886 {
4887         l2arc_thread_exit = 0;
4888         l2arc_ndev = 0;
4889         l2arc_writes_sent = 0;
4890         l2arc_writes_done = 0;
4891
4892         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4893         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4894         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4895         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4896         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4897
4898         l2arc_dev_list = &L2ARC_dev_list;
4899         l2arc_free_on_write = &L2ARC_free_on_write;
4900         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4901             offsetof(l2arc_dev_t, l2ad_node));
4902         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4903             offsetof(l2arc_data_free_t, l2df_list_node));
4904 }
4905
4906 void
4907 l2arc_fini(void)
4908 {
4909         /*
4910          * This is called from dmu_fini(), which is called from spa_fini();
4911          * Because of this, we can assume that all l2arc devices have
4912          * already been removed when the pools themselves were removed.
4913          */
4914
4915         l2arc_do_free_on_write();
4916
4917         mutex_destroy(&l2arc_feed_thr_lock);
4918         cv_destroy(&l2arc_feed_thr_cv);
4919         mutex_destroy(&l2arc_dev_mtx);
4920         mutex_destroy(&l2arc_buflist_mtx);
4921         mutex_destroy(&l2arc_free_on_write_mtx);
4922
4923         list_destroy(l2arc_dev_list);
4924         list_destroy(l2arc_free_on_write);
4925 }
4926
4927 void
4928 l2arc_start(void)
4929 {
4930         if (!(spa_mode_global & FWRITE))
4931                 return;
4932
4933         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4934             TS_RUN, minclsyspri);
4935 }
4936
4937 void
4938 l2arc_stop(void)
4939 {
4940         if (!(spa_mode_global & FWRITE))
4941                 return;
4942
4943         mutex_enter(&l2arc_feed_thr_lock);
4944         cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
4945         l2arc_thread_exit = 1;
4946         while (l2arc_thread_exit != 0)
4947                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4948         mutex_exit(&l2arc_feed_thr_lock);
4949 }
4950
4951 #if defined(_KERNEL) && defined(HAVE_SPL)
4952 EXPORT_SYMBOL(arc_read);
4953 EXPORT_SYMBOL(arc_buf_remove_ref);
4954 EXPORT_SYMBOL(arc_getbuf_func);
4955 EXPORT_SYMBOL(arc_add_prune_callback);
4956 EXPORT_SYMBOL(arc_remove_prune_callback);
4957
4958 module_param(zfs_arc_min, ulong, 0444);
4959 MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
4960
4961 module_param(zfs_arc_max, ulong, 0444);
4962 MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
4963
4964 module_param(zfs_arc_meta_limit, ulong, 0444);
4965 MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
4966
4967 module_param(zfs_arc_meta_prune, int, 0444);
4968 MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
4969
4970 module_param(zfs_arc_grow_retry, int, 0444);
4971 MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
4972
4973 module_param(zfs_arc_shrink_shift, int, 0444);
4974 MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
4975
4976 module_param(zfs_arc_p_min_shift, int, 0444);
4977 MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
4978
4979 module_param(l2arc_write_max, ulong, 0444);
4980 MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
4981
4982 module_param(l2arc_write_boost, ulong, 0444);
4983 MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
4984
4985 module_param(l2arc_headroom, ulong, 0444);
4986 MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
4987
4988 module_param(l2arc_feed_secs, ulong, 0444);
4989 MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
4990
4991 module_param(l2arc_feed_min_ms, ulong, 0444);
4992 MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
4993
4994 module_param(l2arc_noprefetch, int, 0444);
4995 MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
4996
4997 module_param(l2arc_feed_again, int, 0444);
4998 MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
4999
5000 module_param(l2arc_norw, int, 0444);
5001 MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
5002
5003 #endif