X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Farc.c;h=96c3fb702ba74f33b75ba930ab958e2e6077e560;hb=1fde1e37208c2f56c72c70a06676676f04b65998;hp=73aecb2852d5282d1c1c9551ee0e51ba50a612b4;hpb=172bb4bd5e4afef721dd4d2972d8680d983f144b;p=zfs.git diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 73aecb2..96c3fb7 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -119,11 +118,11 @@ #include #include -#include #include #include #include #include +#include #ifdef _KERNEL #include #include @@ -132,6 +131,7 @@ #endif #include #include +#include static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ @@ -152,6 +152,12 @@ typedef enum arc_reclaim_strategy { /* number of seconds before growing cache again */ static int arc_grow_retry = 60; +/* shift of arc_c for calculating both min and max arc_p */ +static int arc_p_min_shift = 4; + +/* log2(fraction of arc to reclaim) */ +static int arc_shrink_shift = 5; + /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) @@ -171,7 +177,9 @@ static boolean_t arc_warm; uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; -int zfs_mdcomp_disable = 0; +int zfs_arc_grow_retry = 0; +int zfs_arc_shrink_shift = 0; +int zfs_arc_p_min_shift = 0; /* * Note that buffers can be in one of 6 states: @@ -239,6 +247,9 @@ typedef struct arc_stats { kstat_named_t arcstat_recycle_miss; kstat_named_t arcstat_mutex_miss; kstat_named_t arcstat_evict_skip; + kstat_named_t arcstat_evict_l2_cached; + kstat_named_t arcstat_evict_l2_eligible; + kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; @@ -250,10 +261,14 @@ typedef struct arc_stats { kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; kstat_named_t arcstat_hdr_size; + kstat_named_t arcstat_data_size; + kstat_named_t arcstat_other_size; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_read_bytes; + kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; @@ -288,6 +303,9 @@ static arc_stats_t arc_stats = { { "recycle_miss", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, + { "evict_l2_cached", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible", KSTAT_DATA_UINT64 }, + { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, @@ -299,10 +317,14 @@ static arc_stats_t arc_stats = { { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, + { "data_size", KSTAT_DATA_UINT64 }, + { "other_size", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_read_bytes", KSTAT_DATA_UINT64 }, + { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 
}, @@ -323,7 +345,7 @@ static arc_stats_t arc_stats = { #define ARCSTAT_INCR(stat, val) \ atomic_add_64(&arc_stats.stat.value.ui64, (val)); -#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define ARCSTAT_MAX(stat, val) { \ @@ -357,7 +379,7 @@ static arc_stats_t arc_stats = { } kstat_t *arc_ksp; -static arc_state_t *arc_anon; +static arc_state_t *arc_anon; static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; @@ -380,6 +402,7 @@ static arc_state_t *arc_l2c_only; static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; +static uint64_t arc_loaned_bytes; static uint64_t arc_meta_used; static uint64_t arc_meta_limit; static uint64_t arc_meta_max = 0; @@ -413,6 +436,7 @@ struct arc_buf_hdr { kmutex_t b_freeze_lock; zio_cksum_t *b_freeze_cksum; + void *b_thawed; arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; @@ -425,7 +449,7 @@ struct arc_buf_hdr { /* immutable */ arc_buf_contents_t b_type; uint64_t b_size; - spa_t *b_spa; + uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; @@ -447,7 +471,9 @@ static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); static int arc_evict_needed(arc_buf_contents_t type); -static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); +static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); + +static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ @@ -471,11 +497,11 @@ static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ -#define ARC_STORED (1 << 19) /* has been store()d to */ #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) +#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) @@ -519,8 +545,8 @@ static buf_hash_table_t buf_hash_table; (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) -#define HDR_LOCK(buf) \ - (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) +#define HDR_LOCK(hdr) \ + (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; @@ -529,8 +555,9 @@ uint64_t zfs_crc64_table[256]; */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 4 /* num of writes */ -#define L2ARC_FEED_SECS 1 /* caching interval */ +#define L2ARC_HEADROOM 2 /* num of writes */ +#define L2ARC_FEED_SECS 1 /* caching interval secs */ +#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -542,7 +569,10 @@ uint64_t 
l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ +boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ /* * L2ARC Internals @@ -557,6 +587,7 @@ typedef struct l2arc_dev { uint64_t l2ad_end; /* last addr on device */ uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ + boolean_t l2ad_writing; /* currently writing */ list_t *l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ } l2arc_dev_t; @@ -587,7 +618,7 @@ typedef struct l2arc_write_callback { struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ - daddr_t b_daddr; /* disk address, offset byte */ + uint64_t b_daddr; /* disk address, offset byte */ }; typedef struct l2arc_data_free { @@ -607,9 +638,8 @@ static void l2arc_hdr_stat_add(void); static void l2arc_hdr_stat_remove(void); static uint64_t -buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) +buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { - uintptr_t spav = (uintptr_t)spa; uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i; @@ -619,7 +649,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) for (i = 0; i < sizeof (dva_t); i++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; - crc ^= (spav>>8) ^ birth; + crc ^= (spa>>8) ^ birth; return (crc); } @@ -634,8 +664,17 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((buf)->b_birth == birth) && ((buf)->b_spa == spa) +static void +buf_discard_identity(arc_buf_hdr_t *hdr) +{ + hdr->b_dva.dva_word[0] = 0; + hdr->b_dva.dva_word[1] = 0; + hdr->b_birth = 0; + hdr->b_cksum0 = 0; +} + static arc_buf_hdr_t * -buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) +buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); @@ -755,8 +794,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag) refcount_create(&buf->b_refcnt); cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); - ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); return (0); } @@ -767,7 +806,10 @@ buf_cons(void *vbuf, void *unused, int kmflag) arc_buf_t *buf = vbuf; bzero(buf, sizeof (arc_buf_t)); - rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL); + arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); + return (0); } @@ -781,11 +823,11 @@ hdr_dest(void *vbuf, void *unused) { arc_buf_hdr_t *buf = vbuf; + ASSERT(BUF_EMPTY(buf)); refcount_destroy(&buf->b_refcnt); cv_destroy(&buf->b_cv); mutex_destroy(&buf->b_freeze_lock); - - ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); + arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); } /* ARGSUSED */ @@ -794,7 +836,9 @@ buf_dest(void *vbuf, void *unused) { arc_buf_t *buf = vbuf; - rw_destroy(&buf->b_lock); + 
mutex_destroy(&buf->b_evict_lock); + rw_destroy(&buf->b_data_lock); + arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } /* @@ -921,18 +965,31 @@ arc_buf_thaw(arc_buf_t *buf) kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); buf->b_hdr->b_freeze_cksum = NULL; } + + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_thawed) + kmem_free(buf->b_hdr->b_thawed, 1); + buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); + } + mutex_exit(&buf->b_hdr->b_freeze_lock); } void arc_buf_freeze(arc_buf_t *buf) { + kmutex_t *hash_lock; + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + hash_lock = HDR_LOCK(buf->b_hdr); + mutex_enter(hash_lock); + ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); + mutex_exit(hash_lock); } static void @@ -1004,6 +1061,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(new_state != old_state); ASSERT(refcnt == 0 || ab->b_datacnt > 0); ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); + ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); from_delta = to_delta = ab->b_datacnt * ab->b_size; @@ -1024,7 +1082,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) /* * If prefetching out of the ghost cache, - * we will have a non-null datacnt. + * we will have a non-zero datacnt. */ if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { /* ghost elements have a ghost size */ @@ -1060,9 +1118,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon) { + if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) buf_hash_remove(ab); - } /* adjust state sizes */ if (to_delta) @@ -1081,15 +1138,53 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } void -arc_space_consume(uint64_t space) +arc_space_consume(uint64_t space, arc_space_type_t type) { + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + default: + break; + case ARC_SPACE_DATA: + ARCSTAT_INCR(arcstat_data_size, space); + break; + case ARC_SPACE_OTHER: + ARCSTAT_INCR(arcstat_other_size, space); + break; + case ARC_SPACE_HDRS: + ARCSTAT_INCR(arcstat_hdr_size, space); + break; + case ARC_SPACE_L2HDRS: + ARCSTAT_INCR(arcstat_l2_hdr_size, space); + break; + } + atomic_add_64(&arc_meta_used, space); atomic_add_64(&arc_size, space); } void -arc_space_return(uint64_t space) +arc_space_return(uint64_t space, arc_space_type_t type) { + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + default: + break; + case ARC_SPACE_DATA: + ARCSTAT_INCR(arcstat_data_size, -space); + break; + case ARC_SPACE_OTHER: + ARCSTAT_INCR(arcstat_other_size, -space); + break; + case ARC_SPACE_HDRS: + ARCSTAT_INCR(arcstat_hdr_size, -space); + break; + case ARC_SPACE_L2HDRS: + ARCSTAT_INCR(arcstat_l2_hdr_size, -space); + break; + } + ASSERT(arc_meta_used >= space); if (arc_meta_max < arc_meta_used) arc_meta_max = arc_meta_used; @@ -1126,7 +1221,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; - hdr->b_spa = spa; + hdr->b_spa = spa_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); @@ -1145,6 +1240,56 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) return (buf); } +static char *arc_onloan_tag = "onloan"; + +/* + * Loan out an anonymous arc buffer. 
Loaned buffers are not counted as in + * flight data by arc_tempreserve_space() until they are "returned". Loaned + * buffers must be returned to the arc before they can be used by the DMU or + * freed. + */ +arc_buf_t * +arc_loan_buf(spa_t *spa, int size) +{ + arc_buf_t *buf; + + buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + + atomic_add_64(&arc_loaned_bytes, size); + return (buf); +} + +/* + * Return a loaned arc buffer to the arc. + */ +void +arc_return_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(buf->b_data != NULL); + (void) refcount_add(&hdr->b_refcnt, tag); + (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); + + atomic_add_64(&arc_loaned_bytes, -hdr->b_size); +} + +/* Detach an arc_buf from a dbuf (tag) */ +void +arc_loan_inuse_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr; + + ASSERT(buf->b_data != NULL); + hdr = buf->b_hdr; + (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_refcnt, tag); + buf->b_efunc = NULL; + buf->b_private = NULL; + + atomic_add_64(&arc_loaned_bytes, hdr->b_size); +} + static arc_buf_t * arc_buf_clone(arc_buf_t *from) { @@ -1152,6 +1297,8 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; + ASSERT(hdr->b_state != arc_anon); + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; @@ -1176,19 +1323,20 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) * must verify b_data != NULL to know if the add_ref * was successful. */ - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); if (buf->b_data == NULL) { - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return; } - hdr = buf->b_hdr; - ASSERT(hdr != NULL); - hash_lock = HDR_LOCK(hdr); + hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); - rw_exit(&buf->b_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + mutex_exit(&buf->b_evict_lock); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); add_reference(hdr, hash_lock, tag); + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); @@ -1232,15 +1380,17 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) arc_buf_contents_t type = buf->b_hdr->b_type; arc_cksum_verify(buf); + if (!recycle) { if (type == ARC_BUFC_METADATA) { arc_buf_data_free(buf->b_hdr, zio_buf_free, buf->b_data, size); - arc_space_return(size); + arc_space_return(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); arc_buf_data_free(buf->b_hdr, zio_data_buf_free, buf->b_data, size); + ARCSTAT_INCR(arcstat_data_size, -size); atomic_add_64(&arc_size, -size); } } @@ -1268,6 +1418,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) continue; *bufp = buf->b_next; + buf->b_next = NULL; ASSERT(buf->b_efunc == NULL); @@ -1279,58 +1430,59 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { + l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; + ASSERT(refcount_is_zero(&hdr->b_refcnt)); ASSERT3P(hdr->b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(!(hdr->b_flags & ARC_STORED)); - if (hdr->b_l2hdr != NULL) { - if (!MUTEX_HELD(&l2arc_buflist_mtx)) { - /* - * To prevent arc_free() and l2arc_evict() from - * attempting to free the same buffer at the same time, - * a FREE_IN_PROGRESS flag is given to arc_free() to - * give it 
priority. l2arc_evict() can't destroy this - * header while we are waiting on l2arc_buflist_mtx. - * - * The hdr may be removed from l2ad_buflist before we - * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. - */ + if (l2hdr != NULL) { + boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); + /* + * To prevent arc_free() and l2arc_evict() from + * attempting to free the same buffer at the same time, + * a FREE_IN_PROGRESS flag is given to arc_free() to + * give it priority. l2arc_evict() can't destroy this + * header while we are waiting on l2arc_buflist_mtx. + * + * The hdr may be removed from l2ad_buflist before we + * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. + */ + if (!buflist_held) { mutex_enter(&l2arc_buflist_mtx); - if (hdr->b_l2hdr != NULL) { - list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, - hdr); - } - mutex_exit(&l2arc_buflist_mtx); - } else { - list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); + l2hdr = hdr->b_l2hdr; } - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t)); - if (hdr->b_state == arc_l2c_only) - l2arc_hdr_stat_remove(); - hdr->b_l2hdr = NULL; + + if (l2hdr != NULL) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + if (hdr->b_state == arc_l2c_only) + l2arc_hdr_stat_remove(); + hdr->b_l2hdr = NULL; + } + + if (!buflist_held) + mutex_exit(&l2arc_buflist_mtx); } if (!BUF_EMPTY(hdr)) { ASSERT(!HDR_IN_HASH_TABLE(hdr)); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; + buf_discard_identity(hdr); } while (hdr->b_buf) { arc_buf_t *buf = hdr->b_buf; if (buf->b_efunc) { mutex_enter(&arc_eviction_mtx); - rw_enter(&buf->b_lock, RW_WRITER); + mutex_enter(&buf->b_evict_lock); ASSERT(buf->b_hdr != NULL); arc_buf_destroy(hdr->b_buf, FALSE, FALSE); hdr->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); mutex_exit(&arc_eviction_mtx); } else { arc_buf_destroy(hdr->b_buf, FALSE, TRUE); @@ -1340,6 +1492,10 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } + if (hdr->b_thawed) { + kmem_free(hdr->b_thawed, 1); + hdr->b_thawed = NULL; + } ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -1360,11 +1516,17 @@ arc_buf_free(arc_buf_t *buf, void *tag) kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) + if (hdr->b_datacnt > 1) { arc_buf_destroy(buf, FALSE, TRUE); - else + } else { + ASSERT(buf == hdr->b_buf); + ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_BUF_AVAILABLE; + } mutex_exit(hash_lock); } else if (HDR_IO_IN_PROGRESS(hdr)) { int destroy_hdr; @@ -1381,12 +1543,10 @@ arc_buf_free(arc_buf_t *buf, void *tag) if (destroy_hdr) arc_hdr_destroy(hdr); } else { - if (remove_reference(hdr, NULL, tag) > 0) { - ASSERT(HDR_IO_ERROR(hdr)); + if (remove_reference(hdr, NULL, tag) > 0) arc_buf_destroy(buf, FALSE, TRUE); - } else { + else arc_hdr_destroy(hdr); - } } } @@ -1398,11 +1558,14 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) int no_callback = (buf->b_efunc == NULL); if (hdr->b_state == arc_anon) { + ASSERT(hdr->b_datacnt == 1); arc_buf_free(buf, tag); return (no_callback); } mutex_enter(hash_lock); + hdr = buf->b_hdr; + 
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT(hdr->b_state != arc_anon); ASSERT(buf->b_data != NULL); @@ -1412,6 +1575,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) arc_buf_destroy(buf, FALSE, TRUE); } else if (no_callback) { ASSERT(hdr->b_buf == buf && buf->b_next == NULL); + ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_BUF_AVAILABLE; } ASSERT(no_callback || hdr->b_datacnt > 1 || @@ -1440,7 +1604,7 @@ arc_buf_size(arc_buf_t *buf) * It may also return without evicting as much space as requested. */ static void * -arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, +arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { arc_state_t *evicted_state; @@ -1464,7 +1628,8 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, if (HDR_IO_IN_PROGRESS(ab) || (spa && ab->b_spa != spa) || (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && - lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { + ddi_get_lbolt() - ab->b_arc_access < + arc_min_prefetch_lifespan)) { skipped++; continue; } @@ -1479,7 +1644,7 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, ASSERT(ab->b_datacnt > 0); while (ab->b_buf) { arc_buf_t *buf = ab->b_buf; - if (!rw_tryenter(&buf->b_lock, RW_WRITER)) { + if (!mutex_tryenter(&buf->b_evict_lock)) { missed += 1; break; } @@ -1501,13 +1666,28 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&arc_eviction_mtx); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); } else { - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); arc_buf_destroy(buf, buf->b_data == stolen, TRUE); } } + + if (ab->b_l2hdr) { + ARCSTAT_INCR(arcstat_evict_l2_cached, + ab->b_size); + } else { + if (l2arc_write_eligible(ab->b_spa, ab)) { + ARCSTAT_INCR(arcstat_evict_l2_eligible, + ab->b_size); + } else { + ARCSTAT_INCR( + arcstat_evict_l2_ineligible, + ab->b_size); + } + } + if (ab->b_datacnt == 0) { arc_change_state(evicted_state, ab, hash_lock); ASSERT(HDR_IN_HASH_TABLE(ab)); @@ -1549,12 +1729,12 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mru_ghost->arcs_lsize[type], mru_over); - arc_evict_ghost(arc_mru_ghost, NULL, todelete); + arc_evict_ghost(arc_mru_ghost, 0, todelete); } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c); - arc_evict_ghost(arc_mfu_ghost, NULL, todelete); + arc_evict_ghost(arc_mfu_ghost, 0, todelete); } } @@ -1566,22 +1746,32 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, * bytes. Destroy the buffers that are removed. 
*/ static void -arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) +arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t marker; list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; ASSERT(GHOST_STATE(state)); + bzero(&marker, sizeof(marker)); top: mutex_enter(&state->arcs_mtx); for (ab = list_tail(list); ab; ab = ab_prev) { ab_prev = list_prev(list, ab); if (spa && ab->b_spa != spa) continue; + + /* ignore markers */ + if (ab->b_spa == 0) + continue; + hash_lock = HDR_LOCK(ab); + /* caller may be trying to modify this buffer, skip it */ + if (MUTEX_HELD(hash_lock)) + continue; if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(ab)); ASSERT(ab->b_buf == NULL); @@ -1604,15 +1794,21 @@ top: DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); if (bytes >= 0 && bytes_deleted >= bytes) break; - } else { - if (bytes < 0) { - mutex_exit(&state->arcs_mtx); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - goto top; - } + } else if (bytes < 0) { + /* + * Insert a list marker and then wait for the + * hash lock to become available. Once its + * available, restart from where we left off. + */ + list_insert_after(list, ab, &marker); + mutex_exit(&state->arcs_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + mutex_enter(&state->arcs_mtx); + ab_prev = list_prev(list, &marker); + list_remove(list, &marker); + } else bufs_skipped += 1; - } } mutex_exit(&state->arcs_mtx); @@ -1635,61 +1831,64 @@ top: static void arc_adjust(void) { - int64_t top_sz, mru_over, arc_over, todelete; + int64_t adjustment, delta; + + /* + * Adjust MRU size + */ - top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used; + adjustment = MIN((int64_t)(arc_size - arc_c), + (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - + arc_p)); - if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { - int64_t toevict = - MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); - (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA); - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { + delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); + (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); + adjustment -= delta; } - if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { - int64_t toevict = - MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); - (void) arc_evict(arc_mru, NULL, toevict, FALSE, + if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { + delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); + (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; } - mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; + /* + * Adjust MFU size + */ - if (mru_over > 0) { - if (arc_mru_ghost->arcs_size > 0) { - todelete = MIN(arc_mru_ghost->arcs_size, mru_over); - arc_evict_ghost(arc_mru_ghost, NULL, todelete); - } + adjustment = arc_size - arc_c; + + if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { + delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); + (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); + adjustment -= delta; } - if ((arc_over = arc_size - arc_c) > 0) { - int64_t tbl_over; + if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t delta = MIN(adjustment, + 
arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); + (void) arc_evict(arc_mfu, 0, delta, FALSE, + ARC_BUFC_METADATA); + } - if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { - int64_t toevict = - MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); - (void) arc_evict(arc_mfu, NULL, toevict, FALSE, - ARC_BUFC_DATA); - arc_over = arc_size - arc_c; - } + /* + * Adjust ghost lists + */ - if (arc_over > 0 && - arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { - int64_t toevict = - MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], - arc_over); - (void) arc_evict(arc_mfu, NULL, toevict, FALSE, - ARC_BUFC_METADATA); - } + adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; - tbl_over = arc_size + arc_mru_ghost->arcs_size + - arc_mfu_ghost->arcs_size - arc_c * 2; + if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { + delta = MIN(arc_mru_ghost->arcs_size, adjustment); + arc_evict_ghost(arc_mru_ghost, 0, delta); + } - if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { - todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); - arc_evict_ghost(arc_mfu_ghost, NULL, todelete); - } + adjustment = + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; + + if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { + delta = MIN(arc_mfu_ghost->arcs_size, adjustment); + arc_evict_ghost(arc_mfu_ghost, 0, delta); } } @@ -1700,9 +1899,9 @@ arc_do_user_evicts(void) while (arc_eviction_list != NULL) { arc_buf_t *buf = arc_eviction_list; arc_eviction_list = buf->b_next; - rw_enter(&buf->b_lock, RW_WRITER); + mutex_enter(&buf->b_evict_lock); buf->b_hdr = NULL; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); mutex_exit(&arc_eviction_mtx); if (buf->b_efunc != NULL) @@ -1723,29 +1922,34 @@ arc_do_user_evicts(void) void arc_flush(spa_t *spa) { + uint64_t guid = 0; + + if (spa) + guid = spa_guid(spa); + while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); + (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } - arc_evict_ghost(arc_mru_ghost, spa, -1); - arc_evict_ghost(arc_mfu_ghost, spa, -1); + arc_evict_ghost(arc_mru_ghost, guid, -1); + arc_evict_ghost(arc_mfu_ghost, guid, -1); mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); @@ -1753,8 +1957,6 @@ arc_flush(spa_t *spa) ASSERT(spa || arc_eviction_list == NULL); } -int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ - void arc_shrink(void) { @@ -1787,9 +1989,8 @@ arc_shrink(void) static int arc_reclaim_needed(void) { - uint64_t extra; - #ifdef _KERNEL + uint64_t extra; if (needfree) return (1); @@ -1915,18 +2116,16 @@ arc_reclaim_thread(void) } /* reset the growth delay for every reclaim */ - growtime = lbolt + (arc_grow_retry * hz); + growtime = ddi_get_lbolt() + (arc_grow_retry * hz); arc_kmem_reap_now(last_reclaim); arc_warm = B_TRUE; - } else if (arc_no_grow && lbolt >= growtime) { + } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { arc_no_grow = 
FALSE; } - if (2 * arc_c < arc_size + - arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size) - arc_adjust(); + arc_adjust(); if (arc_eviction_list != NULL) arc_do_user_evicts(); @@ -1934,7 +2133,7 @@ arc_reclaim_thread(void) /* block until needed, or one second, whichever is shorter */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&arc_reclaim_thr_cv, - &arc_reclaim_thr_lock, (lbolt + hz)); + &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); } @@ -1953,6 +2152,7 @@ static void arc_adapt(int bytes, arc_state_t *state) { int mult; + uint64_t arc_p_min = (arc_c >> arc_p_min_shift); if (state == arc_l2c_only) return; @@ -1969,13 +2169,18 @@ arc_adapt(int bytes, arc_state_t *state) if (state == arc_mru_ghost) { mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); + mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ - arc_p = MIN(arc_c, arc_p + bytes * mult); + arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { + uint64_t delta; + mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); + mult = MIN(mult, 10); - arc_p = MAX(0, (int64_t)arc_p - bytes * mult); + delta = MIN(bytes * mult, arc_p); + arc_p = MAX(arc_p_min, arc_p - delta); } ASSERT((int64_t)arc_p >= 0); @@ -2073,10 +2278,11 @@ arc_get_data_buf(arc_buf_t *buf) if (!arc_evict_needed(type)) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); - arc_space_consume(size); + arc_space_consume(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + ARCSTAT_INCR(arcstat_data_size, size); atomic_add_64(&arc_size, size); } goto out; @@ -2093,21 +2299,22 @@ arc_get_data_buf(arc_buf_t *buf) if (state == arc_mru || state == arc_anon) { uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_mfu->arcs_lsize[type] > 0 && + state = (arc_mfu->arcs_lsize[type] >= size && arc_p > mru_used) ? arc_mfu : arc_mru; } else { /* MFU cases */ uint64_t mfu_space = arc_c - arc_p; - state = (arc_mru->arcs_lsize[type] > 0 && + state = (arc_mru->arcs_lsize[type] >= size && mfu_space > arc_mfu->arcs_size) ? 
arc_mru : arc_mfu; } - if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { + if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); - arc_space_consume(size); + arc_space_consume(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + ARCSTAT_INCR(arcstat_data_size, size); atomic_add_64(&arc_size, size); } ARCSTAT_BUMP(arcstat_recycle_miss); @@ -2143,6 +2350,8 @@ out: static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) { + clock_t now; + ASSERT(MUTEX_HELD(hash_lock)); if (buf->b_state == arc_anon) { @@ -2153,11 +2362,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) */ ASSERT(buf->b_arc_access == 0); - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); arc_change_state(arc_mru, buf, hash_lock); } else if (buf->b_state == arc_mru) { + now = ddi_get_lbolt(); + /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read @@ -2173,7 +2384,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) buf->b_flags &= ~ARC_PREFETCH; ARCSTAT_BUMP(arcstat_mru_hits); } - buf->b_arc_access = lbolt; + buf->b_arc_access = now; return; } @@ -2182,13 +2393,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * but it is still in the cache. Move it to the MFU * state. */ - if (lbolt > buf->b_arc_access + ARC_MINTIME) { + if (now > buf->b_arc_access + ARC_MINTIME) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ - buf->b_arc_access = lbolt; + buf->b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } @@ -2211,7 +2422,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); } - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, buf, hash_lock); ARCSTAT_BUMP(arcstat_mru_ghost_hits); @@ -2230,7 +2441,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) ASSERT(list_link_active(&buf->b_arc_node)); } ARCSTAT_BUMP(arcstat_mfu_hits); - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); } else if (buf->b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* @@ -2248,7 +2459,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) new_state = arc_mru; } - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(new_state, buf, hash_lock); @@ -2258,7 +2469,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * This buffer is on the 2nd Level ARC. */ - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } else { @@ -2271,7 +2482,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { - bcopy(buf->b_data, arg, buf->b_hdr->b_size); + if (zio == NULL || zio->io_error == 0) + bcopy(buf->b_data, arg, buf->b_hdr->b_size); VERIFY(arc_buf_remove_ref(buf, arg) == 1); } @@ -2285,6 +2497,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) *bufp = NULL; } else { *bufp = buf; + ASSERT(buf->b_data); } } @@ -2309,7 +2522,7 @@ arc_read_done(zio_t *zio) * reason for it not to be found is if we were freed during the * read. 
*/ - found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, + found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, &hash_lock); ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || @@ -2323,7 +2536,7 @@ arc_read_done(zio_t *zio) /* byteswap if necessary */ callback_list = hdr->b_acb; ASSERT(callback_list != NULL); - if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? byteswap_uint64_array : dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; @@ -2332,6 +2545,16 @@ arc_read_done(zio_t *zio) arc_cksum_compute(buf, B_FALSE); + if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { + /* + * Only call arc_access on anonymous buffers. This is because + * if we've issued an I/O for an evicted buffer, we've already + * called arc_access (to prevent any simultaneous readers from + * getting confused). + */ + arc_access(hdr, hash_lock); + } + /* create copies of the data buffer for the callers */ abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { @@ -2345,8 +2568,11 @@ arc_read_done(zio_t *zio) hdr->b_acb = NULL; hdr->b_flags &= ~ARC_IO_IN_PROGRESS; ASSERT(!HDR_BUF_AVAILABLE(hdr)); - if (abuf == buf) + if (abuf == buf) { + ASSERT(buf->b_efunc == NULL); + ASSERT(hdr->b_datacnt == 1); hdr->b_flags |= ARC_BUF_AVAILABLE; + } ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); @@ -2367,14 +2593,6 @@ arc_read_done(zio_t *zio) cv_broadcast(&hdr->b_cv); if (hash_lock) { - /* - * Only call arc_access on anonymous buffers. This is because - * if we've issued an I/O for an evicted buffer, we've already - * called arc_access (to prevent any simultaneous readers from - * getting confused). - */ - if (zio->io_error == 0 && hdr->b_state == arc_anon) - arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { /* @@ -2428,27 +2646,34 @@ arc_read_done(zio_t *zio) * arc_read_bp. */ int -arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { int err; - arc_buf_hdr_t *hdr = pbuf->b_hdr; + + if (pbuf == NULL) { + /* + * XXX This happens from traverse callback funcs, for + * the objset_phys_t block. 
+ */ + return (arc_read_nolock(pio, spa, bp, done, private, priority, + zio_flags, arc_flags, zb)); + } ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); - rw_enter(&pbuf->b_lock, RW_READER); + rw_enter(&pbuf->b_data_lock, RW_READER); err = arc_read_nolock(pio, spa, bp, done, private, priority, zio_flags, arc_flags, zb); + rw_exit(&pbuf->b_data_lock); - ASSERT3P(hdr, ==, pbuf->b_hdr); - rw_exit(&pbuf->b_lock); return (err); } int -arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, +arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { @@ -2456,9 +2681,11 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *buf; kmutex_t *hash_lock; zio_t *rzio; + uint64_t guid = spa_guid(spa); top: - hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), + &hash_lock); if (hdr && hdr->b_datacnt > 0) { *arc_flags |= ARC_CACHED; @@ -2481,7 +2708,7 @@ top: acb->acb_private = private; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, - spa, NULL, NULL, zio_flags); + spa, NULL, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); acb->acb_next = hdr->b_acb; @@ -2512,6 +2739,7 @@ top: } else { buf = arc_buf_clone(buf); } + } else if (*arc_flags & ARC_PREFETCH && refcount_count(&hdr->b_refcnt) == 0) { hdr->b_flags |= ARC_PREFETCH; @@ -2532,7 +2760,8 @@ top: uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; - daddr_t addr; + uint64_t addr; + boolean_t devw = B_FALSE; if (hdr == NULL) { /* this block is not in the cache */ @@ -2541,15 +2770,13 @@ top: buf = arc_buf_alloc(spa, size, private, type); hdr = buf->b_hdr; hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = bp->blk_birth; + hdr->b_birth = BP_PHYSICAL_BIRTH(bp); hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; + buf_discard_identity(hdr); (void) arc_buf_remove_ref(buf, private); goto top; /* restart the IO request */ } @@ -2584,12 +2811,14 @@ top: buf->b_private = NULL; buf->b_next = NULL; hdr->b_buf = buf; - arc_get_data_buf(buf); ASSERT(hdr->b_datacnt == 0); hdr->b_datacnt = 1; - + arc_get_data_buf(buf); + arc_access(hdr, hash_lock); } + ASSERT(!GHOST_STATE(hdr->b_state)); + acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; @@ -2598,19 +2827,9 @@ top: hdr->b_acb = acb; hdr->b_flags |= ARC_IO_IN_PROGRESS; - /* - * If the buffer has been evicted, migrate it to a present state - * before issuing the I/O. Once we drop the hash-table lock, - * the header will be marked as I/O in progress and have an - * attached buffer. At this point, anybody who finds this - * buffer ought to notice that it's legit but has a pending I/O. - */ - - if (GHOST_STATE(hdr->b_state)) - arc_access(hdr, hash_lock); - if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { + devw = hdr->b_l2hdr->b_dev->l2ad_writing; addr = hdr->b_l2hdr->b_daddr; /* * Lock out device removal. 
@@ -2623,14 +2842,14 @@ top: mutex_exit(hash_lock); ASSERT3U(hdr->b_size, ==, size); - DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, - zbookmark_t *, zb); + DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, + uint64_t, size, zbookmark_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); - if (vd != NULL) { + if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. @@ -2638,9 +2857,11 @@ top: * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. + * 5. This isn't prefetch and l2arc_noprefetch is set. */ if (hdr->b_l2hdr != NULL && - !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && + !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); @@ -2666,6 +2887,7 @@ top: ZIO_FLAG_DONT_RETRY, B_FALSE); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); + ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_NOWAIT) { zio_nowait(rzio); @@ -2685,6 +2907,14 @@ top: ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } + } else { + if (vd != NULL) + spa_config_exit(spa, SCL_L2ARC, vd); + if (l2arc_ndev != 0) { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + } } rzio = zio_read(pio, spa, bp, buf->b_data, size, @@ -2699,46 +2929,15 @@ top: return (0); } -/* - * arc_read() variant to support pool traversal. If the block is already - * in the ARC, make a copy of it; otherwise, the caller will do the I/O. - * The idea is that we don't want pool traversal filling up memory, but - * if the ARC already has the data anyway, we shouldn't pay for the I/O. - */ -int -arc_tryread(spa_t *spa, blkptr_t *bp, void *data) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_mtx; - int rc = 0; - - hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); - - if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { - arc_buf_t *buf = hdr->b_buf; - - ASSERT(buf); - while (buf->b_data == NULL) { - buf = buf->b_next; - ASSERT(buf); - } - bcopy(buf->b_data, data, hdr->b_size); - } else { - rc = ENOENT; - } - - if (hash_mtx) - mutex_exit(hash_mtx); - - return (rc); -} - void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) { ASSERT(buf->b_hdr != NULL); ASSERT(buf->b_hdr->b_state != arc_anon); ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); + ASSERT(buf->b_efunc == NULL); + ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); + buf->b_efunc = func; buf->b_private = private; } @@ -2755,14 +2954,14 @@ arc_buf_evict(arc_buf_t *buf) kmutex_t *hash_lock; arc_buf_t **bufp; - rw_enter(&buf->b_lock, RW_WRITER); + mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(). */ ASSERT(buf->b_data == NULL); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (0); } else if (buf->b_data == NULL) { arc_buf_t copy = *buf; /* structure assignment */ @@ -2771,14 +2970,15 @@ arc_buf_evict(arc_buf_t *buf) * but let arc_do_user_evicts() do the reaping. 
*/ buf->b_efunc = NULL; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); VERIFY(copy.b_efunc(&copy) == 0); return (1); } hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT(buf->b_hdr == hdr); ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); @@ -2797,6 +2997,7 @@ arc_buf_evict(arc_buf_t *buf) arc_state_t *old_state = hdr->b_state; arc_state_t *evicted_state; + ASSERT(hdr->b_buf == NULL); ASSERT(refcount_is_zero(&hdr->b_refcnt)); evicted_state = @@ -2814,12 +3015,13 @@ arc_buf_evict(arc_buf_t *buf) mutex_exit(&old_state->arcs_mtx); } mutex_exit(hash_lock); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); VERIFY(buf->b_efunc(buf) == 0); buf->b_efunc = NULL; buf->b_private = NULL; buf->b_hdr = NULL; + buf->b_next = NULL; kmem_cache_free(buf_cache, buf); return (1); } @@ -2834,30 +3036,32 @@ void arc_release(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; + kmutex_t *hash_lock = NULL; l2arc_buf_hdr_t *l2hdr; uint64_t buf_size; - rw_enter(&buf->b_lock, RW_WRITER); + /* + * It would be nice to assert that if it's DMU metadata (level > + * 0 || it's the dnode file), then it must be syncing context. + * But we don't know that information at this level. + */ + + mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; /* this buffer is not on any list */ ASSERT(refcount_count(&hdr->b_refcnt) > 0); - ASSERT(!(hdr->b_flags & ARC_STORED)); if (hdr->b_state == arc_anon) { /* this buffer is already released */ ASSERT(buf->b_efunc == NULL); - ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); - ASSERT(BUF_EMPTY(hdr)); - arc_buf_thaw(buf); - rw_exit(&buf->b_lock); - return; + } else { + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); } - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - l2hdr = hdr->b_l2hdr; if (l2hdr) { mutex_enter(&l2arc_buflist_mtx); @@ -2872,20 +3076,20 @@ arc_release(arc_buf_t *buf, void *tag) arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; - spa_t *spa = hdr->b_spa; + uint64_t spa = hdr->b_spa; arc_buf_contents_t type = hdr->b_type; uint32_t flags = hdr->b_flags; ASSERT(hdr->b_buf != buf || buf->b_next != NULL); /* - * Pull the data off of this buf and attach it to - * a new anonymous buf. + * Pull the data off of this hdr and attach it to + * a new anonymous hdr.
*/ (void) remove_reference(hdr, hash_lock, tag); bufp = &hdr->b_buf; while (*bufp != buf) bufp = &(*bufp)->b_next; - *bufp = (*bufp)->b_next; + *bufp = buf->b_next; buf->b_next = NULL; ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); @@ -2913,20 +3117,20 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_freeze_cksum = NULL; (void) refcount_add(&nhdr->b_refcnt, tag); buf->b_hdr = nhdr; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); atomic_add_64(&arc_anon->arcs_size, blksz); } else { - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_refcnt) == 1); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - arc_change_state(arc_anon, hdr, hash_lock); + if (hdr->b_state != arc_anon) + arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; - mutex_exit(hash_lock); + if (hash_lock) + mutex_exit(hash_lock); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; + buf_discard_identity(hdr); arc_buf_thaw(buf); } buf->b_efunc = NULL; @@ -2940,14 +3144,27 @@ arc_release(arc_buf_t *buf, void *tag) } } +/* + * Release this buffer. If it does not match the provided BP, fill it + * with that block's contents. + */ +/* ARGSUSED */ +int +arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, + zbookmark_t *zb) +{ + arc_release(buf, tag); + return (0); +} + int arc_released(arc_buf_t *buf) { int released; - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (released); } @@ -2956,9 +3173,9 @@ arc_has_callback(arc_buf_t *buf) { int callback; - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); callback = (buf->b_efunc != NULL); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (callback); } @@ -2968,9 +3185,9 @@ arc_referenced(arc_buf_t *buf) { int referenced; - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); referenced = (refcount_count(&buf->b_hdr->b_refcnt)); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (referenced); } #endif @@ -3010,21 +3227,28 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - hdr->b_acb = NULL; + ASSERT(hdr->b_acb == NULL); + + if (zio->io_error == 0) { + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + } else { + ASSERT(BUF_EMPTY(hdr)); + } - hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = zio->io_bp->blk_birth; - hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; /* * If the block to be written was all-zero, we may have * compressed it away. In this case no write was performed - * so there will be no dva/birth-date/checksum. The buffer - * must therefor remain anonymous (and uncached). + * so there will be no dva/birth/checksum. The buffer must + * therefore remain anonymous (and uncached). */ if (!BUF_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; + ASSERT(zio->io_error == 0); + arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); @@ -3034,106 +3258,54 @@ arc_write_done(zio_t *zio) * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). 
*/ - ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE); - ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), - BP_IDENTITY(zio->io_bp))); - ASSERT3U(zio->io_bp_orig.blk_birth, ==, - zio->io_bp->blk_birth); - - ASSERT(refcount_is_zero(&exists->b_refcnt)); - arc_change_state(arc_anon, exists, hash_lock); - mutex_exit(hash_lock); - arc_hdr_destroy(exists); - exists = buf_hash_insert(hdr, &hash_lock); - ASSERT3P(exists, ==, NULL); + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) + panic("bad overwrite, hdr=%p exists=%p", + (void *)hdr, (void *)exists); + ASSERT(refcount_is_zero(&exists->b_refcnt)); + arc_change_state(arc_anon, exists, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(exists); + exists = buf_hash_insert(hdr, &hash_lock); + ASSERT3P(exists, ==, NULL); + } else { + /* Dedup */ + ASSERT(hdr->b_datacnt == 1); + ASSERT(hdr->b_state == arc_anon); + ASSERT(BP_GET_DEDUP(zio->io_bp)); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + } } hdr->b_flags &= ~ARC_IO_IN_PROGRESS; /* if it's not anon, we are doing a scrub */ - if (hdr->b_state == arc_anon) + if (!exists && hdr->b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); - } else if (callback->awcb_done == NULL) { - int destroy_hdr; - /* - * This is an anonymous buffer with no user callback, - * destroy it if there are no active references. - */ - mutex_enter(&arc_eviction_mtx); - destroy_hdr = refcount_is_zero(&hdr->b_refcnt); - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; - mutex_exit(&arc_eviction_mtx); - if (destroy_hdr) - arc_hdr_destroy(hdr); } else { hdr->b_flags &= ~ARC_IO_IN_PROGRESS; } - hdr->b_flags &= ~ARC_STORED; - if (callback->awcb_done) { - ASSERT(!refcount_is_zero(&hdr->b_refcnt)); - callback->awcb_done(zio, buf, callback->awcb_private); - } + ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + callback->awcb_done(zio, buf, callback->awcb_private); kmem_free(callback, sizeof (arc_write_callback_t)); } -void -write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp) -{ - boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata); - - /* Determine checksum setting */ - if (ismd) { - /* - * Metadata always gets checksummed. If the data - * checksum is multi-bit correctable, and it's not a - * ZBT-style checksum, then it's suitable for metadata - * as well. Otherwise, the metadata checksum defaults - * to fletcher4. - */ - if (zio_checksum_table[wp->wp_oschecksum].ci_correctable && - !zio_checksum_table[wp->wp_oschecksum].ci_zbt) - zp->zp_checksum = wp->wp_oschecksum; - else - zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4; - } else { - zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum, - wp->wp_oschecksum); - } - - /* Determine compression setting */ - if (ismd) { - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - zp->zp_compress = zfs_mdcomp_disable ? 
ZIO_COMPRESS_EMPTY : - ZIO_COMPRESS_LZJB; - } else { - zp->zp_compress = zio_compress_select(wp->wp_dncompress, - wp->wp_oscompress); - } - - zp->zp_type = wp->wp_type; - zp->zp_level = wp->wp_level; - zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa)); -} - zio_t * -arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, - boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int zio_flags, const zbookmark_t *zb) +arc_write(zio_t *pio, spa_t *spa, uint64_t txg, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, + arc_done_func_t *ready, arc_done_func_t *done, void *private, + int priority, int zio_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; - zio_prop_t zp; ASSERT(ready != NULL); + ASSERT(done != NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); - ASSERT(hdr->b_acb == 0); + ASSERT(hdr->b_acb == NULL); if (l2arc) hdr->b_flags |= ARC_L2CACHE; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); @@ -3142,92 +3314,16 @@ arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, callback->awcb_private = private; callback->awcb_buf = buf; - write_policy(spa, wp, &zp); - zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp, + zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); return (zio); } -int -arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, uint32_t arc_flags) -{ - arc_buf_hdr_t *ab; - kmutex_t *hash_lock; - zio_t *zio; - - /* - * If this buffer is in the cache, release it, so it - * can be re-used. - */ - ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); - if (ab != NULL) { - /* - * The checksum of blocks to free is not always - * preserved (eg. on the deadlist). However, if it is - * nonzero, it should match what we have in the cache. - */ - ASSERT(bp->blk_cksum.zc_word[0] == 0 || - bp->blk_cksum.zc_word[0] == ab->b_cksum0 || - bp->blk_fill == BLK_FILL_ALREADY_FREED); - - if (ab->b_state != arc_anon) - arc_change_state(arc_anon, ab, hash_lock); - if (HDR_IO_IN_PROGRESS(ab)) { - /* - * This should only happen when we prefetch. - */ - ASSERT(ab->b_flags & ARC_PREFETCH); - ASSERT3U(ab->b_datacnt, ==, 1); - ab->b_flags |= ARC_FREED_IN_READ; - if (HDR_IN_HASH_TABLE(ab)) - buf_hash_remove(ab); - ab->b_arc_access = 0; - bzero(&ab->b_dva, sizeof (dva_t)); - ab->b_birth = 0; - ab->b_cksum0 = 0; - ab->b_buf->b_efunc = NULL; - ab->b_buf->b_private = NULL; - mutex_exit(hash_lock); - } else if (refcount_is_zero(&ab->b_refcnt)) { - ab->b_flags |= ARC_FREE_IN_PROGRESS; - mutex_exit(hash_lock); - arc_hdr_destroy(ab); - ARCSTAT_BUMP(arcstat_deleted); - } else { - /* - * We still have an active reference on this - * buffer. This can happen, e.g., from - * dbuf_unoverride(). 
- */ - ASSERT(!HDR_IN_HASH_TABLE(ab)); - ab->b_arc_access = 0; - bzero(&ab->b_dva, sizeof (dva_t)); - ab->b_birth = 0; - ab->b_cksum0 = 0; - ab->b_buf->b_efunc = NULL; - ab->b_buf->b_private = NULL; - mutex_exit(hash_lock); - } - } - - zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED); - - if (arc_flags & ARC_WAIT) - return (zio_wait(zio)); - - ASSERT(arc_flags & ARC_NOWAIT); - zio_nowait(zio); - - return (0); -} - static int -arc_memory_throttle(uint64_t reserve, uint64_t txg) +arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) { #ifdef _KERNEL - uint64_t inflight_data = arc_anon->arcs_size; uint64_t available_memory = ptob(freemem); static uint64_t page_load = 0; static uint64_t last_txg = 0; @@ -3289,6 +3385,7 @@ int arc_tempreserve_space(uint64_t reserve, uint64_t txg) { int error; + uint64_t anon_size; #ifdef ZFS_DEBUG /* @@ -3305,11 +3402,18 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) return (ENOMEM); /* + * Don't count loaned bufs as in flight dirty data to prevent long + * network delays from blocking transactions that are ready to be + * assigned to a txg. + */ + anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); + + /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefor need to * make sure that there is sufficient available memory for this. */ - if (error = arc_memory_throttle(reserve, txg)) + if ((error = arc_memory_throttle(reserve, anon_size, txg))) return (error); /* @@ -3319,8 +3423,9 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ - if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && - arc_anon->arcs_size > arc_c / 4) { + + if (reserve + arc_tempreserve + anon_size > arc_c / 2 && + anon_size > arc_c / 4) { dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", arc_tempreserve>>10, @@ -3385,6 +3490,15 @@ arc_init(void) if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; + if (zfs_arc_grow_retry > 0) + arc_grow_retry = zfs_arc_grow_retry; + + if (zfs_arc_shrink_shift > 0) + arc_shrink_shift = zfs_arc_shrink_shift; + + if (zfs_arc_p_min_shift > 0) + arc_p_min_shift = zfs_arc_p_min_shift; + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -3491,10 +3605,13 @@ arc_fini(void) mutex_destroy(&arc_mru_ghost->arcs_mtx); mutex_destroy(&arc_mfu->arcs_mtx); mutex_destroy(&arc_mfu_ghost->arcs_mtx); + mutex_destroy(&arc_l2c_only->arcs_mtx); mutex_destroy(&zfs_write_limit_lock); buf_fini(); + + ASSERT(arc_loaned_bytes == 0); } /* @@ -3622,8 +3739,70 @@ arc_fini(void) * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. + * + * There are three key functions that control how the L2ARC warms up: + * + * l2arc_write_eligible() check if a buffer is eligible to cache + * l2arc_write_size() calculate how much to write + * l2arc_write_interval() calculate sleep delay between writes + * + * These three functions determine what to write, how much, and how quickly + * to send writes. */ +static boolean_t +l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) +{ + /* + * A buffer is *not* eligible for the L2ARC if it: + * 1. belongs to a different spa. + * 2. 
is already cached on the L2ARC. + * 3. has an I/O in progress (it may be an incomplete read). + * 4. is flagged not eligible (zfs property). + */ + if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || + HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) + return (B_FALSE); + + return (B_TRUE); +} + +static uint64_t +l2arc_write_size(l2arc_dev_t *dev) +{ + uint64_t size; + + size = dev->l2ad_write; + + if (arc_warm == B_FALSE) + size += dev->l2ad_boost; + + return (size); + +} + +static clock_t +l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) +{ + clock_t interval, next, now; + + /* + * If the ARC lists are busy, increase our write rate; if the + * lists are stale, idle back. This is achieved by checking + * how much we previously wrote - if it was more than half of + * what we wanted, schedule the next write much sooner. + */ + if (l2arc_feed_again && wrote > (wanted / 2)) + interval = (hz * l2arc_feed_min_ms) / 1000; + else + interval = hz * l2arc_feed_secs; + + now = ddi_get_lbolt(); + next = MAX(now, MIN(now + interval, began + interval)); + + return (next); +} + static void l2arc_hdr_stat_add(void) { @@ -3703,7 +3882,7 @@ out: * Free buffers that were tagged for destruction. */ static void -l2arc_do_free_on_write() +l2arc_do_free_on_write(void) { list_t *buflist; l2arc_data_free_t *df, *df_prev; @@ -3821,11 +4000,11 @@ l2arc_read_done(zio_t *zio) ASSERT(cb != NULL); buf = cb->l2rcb_buf; ASSERT(buf != NULL); - hdr = buf->b_hdr; - ASSERT(hdr != NULL); - hash_lock = HDR_LOCK(hdr); + hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * Check this survived the L2ARC journey. @@ -3856,11 +4035,15 @@ l2arc_read_done(zio_t *zio) * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ - if (zio->io_waiter == NULL) - zio_nowait(zio_read(zio->io_parent, - cb->l2rcb_spa, &cb->l2rcb_bp, + if (zio->io_waiter == NULL) { + zio_t *pio = zio_unique_parent(zio); + + ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); + + zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, buf->b_data, zio->io_size, arc_read_done, buf, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + } } kmem_free(cb, sizeof (l2arc_read_callback_t)); @@ -4034,7 +4217,7 @@ top: } mutex_exit(&l2arc_buflist_mtx); - spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict)); + vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); dev->l2ad_evict = taddr; } @@ -4044,7 +4227,7 @@ top: * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. */ -static void +static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *ab, *ab_prev, *head; @@ -4056,6 +4239,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) boolean_t have_lock, full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; + uint64_t guid = spa_guid(spa); + int try; ASSERT(dev->l2ad_vdev != NULL); @@ -4069,7 +4254,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) * Copy buffers for L2ARC writing. 
*/ mutex_enter(&l2arc_buflist_mtx); - for (int try = 0; try <= 3; try++) { + for (try = 0; try <= 3; try++) { list = l2arc_list_locked(try, &list_lock); passed_sz = 0; @@ -4109,20 +4294,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } - if (ab->b_spa != spa) { - mutex_exit(hash_lock); - continue; - } - - if (ab->b_l2hdr != NULL) { - /* - * Already in L2ARC. - */ - mutex_exit(hash_lock); - continue; - } - - if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) { + if (!l2arc_write_eligible(guid, ab)) { mutex_exit(hash_lock); continue; } @@ -4133,12 +4305,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } - if (ab->b_buf == NULL) { - DTRACE_PROBE1(l2arc__buf__null, void *, ab); - mutex_exit(hash_lock); - continue; - } - if (pio == NULL) { /* * Insert a dummy header on the buflist so @@ -4205,27 +4371,32 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) if (pio == NULL) { ASSERT3U(write_sz, ==, 0); kmem_cache_free(hdr_cache, head); - return; + return (0); } ASSERT3U(write_sz, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); + ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); ARCSTAT_INCR(arcstat_l2_size, write_sz); - spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); + vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - spa_l2cache_space_update(dev->l2ad_vdev, 0, - dev->l2ad_end - dev->l2ad_hand); + vdev_space_update(dev->l2ad_vdev, + dev->l2ad_end - dev->l2ad_hand, 0, 0); dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; } + dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); + dev->l2ad_writing = B_FALSE; + + return (write_sz); } /* @@ -4238,20 +4409,19 @@ l2arc_feed_thread(void) callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; - uint64_t size; + uint64_t size, wrote; + clock_t begin, next = ddi_get_lbolt(); CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); while (l2arc_thread_exit == 0) { - /* - * Pause for l2arc_feed_secs seconds between writes. - */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - lbolt + (hz * l2arc_feed_secs)); + next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. @@ -4262,6 +4432,7 @@ l2arc_feed_thread(void) continue; } mutex_exit(&l2arc_dev_mtx); + begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in @@ -4280,6 +4451,16 @@ l2arc_feed_thread(void) ASSERT(spa != NULL); /* + * If the pool is read-only then force the feed thread to + * sleep a little longer. + */ + if (!spa_writeable(spa)) { + next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + /* * Avoid contributing to memory pressure. */ if (arc_reclaim_needed()) { @@ -4290,9 +4471,7 @@ l2arc_feed_thread(void) ARCSTAT_BUMP(arcstat_l2_feeds); - size = dev->l2ad_write; - if (arc_warm == B_FALSE) - size += dev->l2ad_boost; + size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. @@ -4302,7 +4481,12 @@ l2arc_feed_thread(void) /* * Write ARC buffers. */ - l2arc_write_buffers(spa, dev, size); + wrote = l2arc_write_buffers(spa, dev, size); + + /* + * Calculate interval between writes. 
+ */ + next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } @@ -4333,7 +4517,7 @@ l2arc_vdev_present(vdev_t *vd) * validated the vdev and opened it. */ void -l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) +l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; @@ -4347,11 +4531,12 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) adddev->l2ad_vdev = vd; adddev->l2ad_write = l2arc_write_max; adddev->l2ad_boost = l2arc_write_boost; - adddev->l2ad_start = start; - adddev->l2ad_end = end; + adddev->l2ad_start = VDEV_LABEL_START_SIZE; + adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; + adddev->l2ad_writing = B_FALSE; ASSERT3U(adddev->l2ad_write, >, 0); /* @@ -4362,7 +4547,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2node)); - spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0); + vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); /* * Add device to global list @@ -4457,7 +4642,7 @@ l2arc_fini(void) void l2arc_start(void) { - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, @@ -4467,7 +4652,7 @@ l2arc_start(void) void l2arc_stop(void) { - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return; mutex_enter(&l2arc_feed_thr_lock);