X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Farc.c;h=d5e5aa544bfd09112f53a37e694208a1b9eb70cd;hb=fa42225a3d5daa58704bdb920ec92cd95c274011;hp=464fe9fddf8b92c0d925ea7ce45e7533fa99389c;hpb=fb5f0bc83330c8a0236c4d34a23723ac1974971a;p=zfs.git

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 464fe9f..d5e5aa5 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
@@ -124,6 +124,7 @@
 #include
 #include
 #include
+#include
 #ifdef _KERNEL
 #include
 #include
@@ -152,6 +153,12 @@ typedef enum arc_reclaim_strategy {
 /* number of seconds before growing cache again */
 static int arc_grow_retry = 60;
 
+/* shift of arc_c for calculating both min and max arc_p */
+static int arc_p_min_shift = 4;
+
+/* log2(fraction of arc to reclaim) */
+static int arc_shrink_shift = 5;
+
 /*
  * minimum lifespan of a prefetch block in clock ticks
  * (initialized in arc_init())
  */
@@ -172,6 +179,9 @@ uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
 int zfs_mdcomp_disable = 0;
+int zfs_arc_grow_retry = 0;
+int zfs_arc_shrink_shift = 0;
+int zfs_arc_p_min_shift = 0;
 
 /*
  * Note that buffers can be in one of 6 states:
@@ -250,10 +260,14 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_c_max;
 	kstat_named_t arcstat_size;
 	kstat_named_t arcstat_hdr_size;
+	kstat_named_t arcstat_data_size;
+	kstat_named_t arcstat_other_size;
 	kstat_named_t arcstat_l2_hits;
 	kstat_named_t arcstat_l2_misses;
 	kstat_named_t arcstat_l2_feeds;
 	kstat_named_t arcstat_l2_rw_clash;
+	kstat_named_t arcstat_l2_read_bytes;
+	kstat_named_t arcstat_l2_write_bytes;
 	kstat_named_t arcstat_l2_writes_sent;
 	kstat_named_t arcstat_l2_writes_done;
 	kstat_named_t arcstat_l2_writes_error;
@@ -299,10 +313,14 @@ static arc_stats_t arc_stats = {
 	{ "c_max", KSTAT_DATA_UINT64 },
 	{ "size", KSTAT_DATA_UINT64 },
 	{ "hdr_size", KSTAT_DATA_UINT64 },
+	{ "data_size", KSTAT_DATA_UINT64 },
+	{ "other_size", KSTAT_DATA_UINT64 },
 	{ "l2_hits", KSTAT_DATA_UINT64 },
 	{ "l2_misses", KSTAT_DATA_UINT64 },
 	{ "l2_feeds", KSTAT_DATA_UINT64 },
 	{ "l2_rw_clash", KSTAT_DATA_UINT64 },
+	{ "l2_read_bytes", KSTAT_DATA_UINT64 },
+	{ "l2_write_bytes", KSTAT_DATA_UINT64 },
 	{ "l2_writes_sent", KSTAT_DATA_UINT64 },
 	{ "l2_writes_done", KSTAT_DATA_UINT64 },
 	{ "l2_writes_error", KSTAT_DATA_UINT64 },
@@ -380,6 +398,7 @@ static arc_state_t *arc_l2c_only;
 static int arc_no_grow;	/* Don't try to grow cache size */
 static uint64_t arc_tempreserve;
+static uint64_t arc_loaned_bytes;
 static uint64_t arc_meta_used;
 static uint64_t arc_meta_limit;
 static uint64_t arc_meta_max = 0;
@@ -425,7 +444,7 @@ struct arc_buf_hdr {
 	/* immutable */
 	arc_buf_contents_t b_type;
 	uint64_t b_size;
-	spa_t *b_spa;
+	uint64_t b_spa;
 
 	/* protected by arc state mutex */
 	arc_state_t *b_state;
@@ -447,7 +466,7 @@ static arc_buf_hdr_t arc_eviction_hdr;
 static void arc_get_data_buf(arc_buf_t *buf);
 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 static int arc_evict_needed(arc_buf_contents_t type);
-static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
+static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 
 #define	GHOST_STATE(state) \
 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
@@ -476,6 +495,7 @@ static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
+#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
 #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
 #define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
 #define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
@@ -529,8 +549,9 @@ uint64_t zfs_crc64_table[256];
  */
 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
-#define	L2ARC_HEADROOM		4		/* num of writes */
-#define	L2ARC_FEED_SECS		1		/* caching interval */
+#define	L2ARC_HEADROOM		2		/* num of writes */
+#define	L2ARC_FEED_SECS		1		/* caching interval secs */
+#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
 #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
 #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
@@ -542,7 +563,10 @@ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
+uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
 boolean_t l2arc_noprefetch = B_TRUE;	/* don't cache prefetch bufs */
+boolean_t l2arc_feed_again = B_TRUE;	/* turbo warmup */
+boolean_t l2arc_norw = B_TRUE;		/* no reads during writes */
 
 /*
  * L2ARC Internals
@@ -557,6 +581,7 @@ typedef struct l2arc_dev {
 	uint64_t l2ad_end;	/* last addr on device */
 	uint64_t l2ad_evict;	/* last addr eviction reached */
 	boolean_t l2ad_first;	/* first sweep through */
+	boolean_t l2ad_writing;	/* currently writing */
 	list_t *l2ad_buflist;	/* buffer list */
 	list_node_t l2ad_node;	/* device list node */
 } l2arc_dev_t;
@@ -587,7 +612,7 @@ typedef struct l2arc_write_callback {
 struct l2arc_buf_hdr {
 	/* protected by arc_buf_hdr mutex */
 	l2arc_dev_t *b_dev;	/* L2ARC device */
-	daddr_t b_daddr;	/* disk address, offset byte */
+	uint64_t b_daddr;	/* disk address, offset byte */
 };
 
 typedef struct l2arc_data_free {
@@ -607,9 +632,8 @@ static void l2arc_hdr_stat_add(void);
 static void l2arc_hdr_stat_remove(void);
 
 static uint64_t
-buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
+buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
-	uintptr_t spav = (uintptr_t)spa;
 	uint8_t *vdva = (uint8_t *)dva;
 	uint64_t crc = -1ULL;
 	int i;
@@ -619,7 +643,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
 	for (i = 0; i < sizeof (dva_t); i++)
 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 
-	crc ^= (spav>>8) ^ birth;
+	crc ^= (spa>>8) ^ birth;
 
 	return (crc);
 }
@@ -635,7 +659,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
 	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 
 static arc_buf_hdr_t *
-buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
+buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 {
 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
@@ -755,8 +779,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag)
 	refcount_create(&buf->b_refcnt);
 	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 
-	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
 	return (0);
 }
 
@@ -768,6 +792,8 @@ buf_cons(void *vbuf, void *unused, int kmflag)
 	bzero(buf, sizeof (arc_buf_t));
 	rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
+	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
+
 	return (0);
 }
 
@@ -784,8 +810,7 @@ hdr_dest(void *vbuf, void *unused)
 	refcount_destroy(&buf->b_refcnt);
 	cv_destroy(&buf->b_cv);
 	mutex_destroy(&buf->b_freeze_lock);
-
-	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
+	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 }
 
 /* ARGSUSED */
@@ -795,6 +820,7 @@ buf_dest(void *vbuf, void *unused)
 	arc_buf_t *buf = vbuf;
 
 	rw_destroy(&buf->b_lock);
+	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
 /*
@@ -1081,15 +1107,49 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 }
 
 void
-arc_space_consume(uint64_t space)
+arc_space_consume(uint64_t space, arc_space_type_t type)
 {
+	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
+
+	switch (type) {
+	case ARC_SPACE_DATA:
+		ARCSTAT_INCR(arcstat_data_size, space);
+		break;
+	case ARC_SPACE_OTHER:
+		ARCSTAT_INCR(arcstat_other_size, space);
+		break;
+	case ARC_SPACE_HDRS:
+		ARCSTAT_INCR(arcstat_hdr_size, space);
+		break;
+	case ARC_SPACE_L2HDRS:
+		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
+		break;
+	}
+
 	atomic_add_64(&arc_meta_used, space);
 	atomic_add_64(&arc_size, space);
 }
 
 void
-arc_space_return(uint64_t space)
+arc_space_return(uint64_t space, arc_space_type_t type)
 {
+	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
+
+	switch (type) {
+	case ARC_SPACE_DATA:
+		ARCSTAT_INCR(arcstat_data_size, -space);
+		break;
+	case ARC_SPACE_OTHER:
+		ARCSTAT_INCR(arcstat_other_size, -space);
+		break;
+	case ARC_SPACE_HDRS:
+		ARCSTAT_INCR(arcstat_hdr_size, -space);
+		break;
+	case ARC_SPACE_L2HDRS:
+		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
+		break;
+	}
+
 	ASSERT(arc_meta_used >= space);
 	if (arc_meta_max < arc_meta_used)
 		arc_meta_max = arc_meta_used;
@@ -1126,7 +1186,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
 	ASSERT(BUF_EMPTY(hdr));
 	hdr->b_size = size;
 	hdr->b_type = type;
-	hdr->b_spa = spa;
+	hdr->b_spa = spa_guid(spa);
 	hdr->b_state = arc_anon;
 	hdr->b_arc_access = 0;
 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
@@ -1145,6 +1205,41 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
 	return (buf);
 }
 
+static char *arc_onloan_tag = "onloan";
+
+/*
+ * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
+ * flight data by arc_tempreserve_space() until they are "returned". Loaned
+ * buffers must be returned to the arc before they can be used by the DMU or
+ * freed.
+ */
+arc_buf_t *
+arc_loan_buf(spa_t *spa, int size)
+{
+	arc_buf_t *buf;
+
+	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
+
+	atomic_add_64(&arc_loaned_bytes, size);
+	return (buf);
+}
+
+/*
+ * Return a loaned arc buffer to the arc.
+ */
+void
+arc_return_buf(arc_buf_t *buf, void *tag)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+
+	ASSERT(hdr->b_state == arc_anon);
+	ASSERT(buf->b_data != NULL);
+	VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
+	VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
+
+	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
+}
+
 static arc_buf_t *
 arc_buf_clone(arc_buf_t *from)
 {
@@ -1189,6 +1284,7 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
 	add_reference(hdr, hash_lock, tag);
+	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 	arc_access(hdr, hash_lock);
 	mutex_exit(hash_lock);
 	ARCSTAT_BUMP(arcstat_hits);
@@ -1236,11 +1332,12 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
 		if (type == ARC_BUFC_METADATA) {
 			arc_buf_data_free(buf->b_hdr, zio_buf_free, buf->b_data, size);
-			arc_space_return(size);
+			arc_space_return(size, ARC_SPACE_DATA);
 		} else {
 			ASSERT(type == ARC_BUFC_DATA);
 			arc_buf_data_free(buf->b_hdr, zio_data_buf_free, buf->b_data, size);
+			ARCSTAT_INCR(arcstat_data_size, -size);
 			atomic_add_64(&arc_size, -size);
 		}
 	}
@@ -1440,7 +1537,7 @@ arc_buf_size(arc_buf_t *buf)
  * It may also return without evicting as much space as requested.
  */
 static void *
-arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
+arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
     arc_buf_contents_t type)
 {
 	arc_state_t *evicted_state;
@@ -1566,7 +1663,7 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
  * bytes.  Destroy the buffers that are removed.
  */
 static void
-arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
 {
 	arc_buf_hdr_t *ab, *ab_prev;
 	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
@@ -1635,61 +1732,63 @@ top:
 static void
 arc_adjust(void)
 {
-	int64_t top_sz, mru_over, arc_over, todelete;
+	int64_t adjustment, delta;
 
-	top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used;
+	/*
+	 * Adjust MRU size
+	 */
+
+	adjustment = MIN(arc_size - arc_c,
+	    arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p);
 
-	if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
-		int64_t toevict =
-		    MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
-		(void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA);
-		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
+		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
+		(void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
+		adjustment -= delta;
 	}
 
-	if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
-		int64_t toevict =
-		    MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
-		(void) arc_evict(arc_mru, NULL, toevict, FALSE,
+	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+		(void) arc_evict(arc_mru, NULL, delta, FALSE,
 		    ARC_BUFC_METADATA);
-		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
 	}
 
-	mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
+	/*
+	 * Adjust MFU size
+	 */
+
+	adjustment = arc_size - arc_c;
 
-	if (mru_over > 0) {
-		if (arc_mru_ghost->arcs_size > 0) {
-			todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
-			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
-		}
+	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
+		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
+		(void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
+		adjustment -= delta;
 	}
 
-	if ((arc_over = arc_size - arc_c) > 0) {
-		int64_t tbl_over;
+	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+		int64_t delta = MIN(adjustment,
+		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
+		(void) arc_evict(arc_mfu, NULL, delta, FALSE,
+		    ARC_BUFC_METADATA);
+	}
 
-		if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
-			int64_t toevict =
-			    MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
-			(void) arc_evict(arc_mfu, NULL, toevict, FALSE,
-			    ARC_BUFC_DATA);
-			arc_over = arc_size - arc_c;
-		}
+	/*
+	 * Adjust ghost lists
+	 */
 
-		if (arc_over > 0 &&
-		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
-			int64_t toevict =
-			    MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
-			    arc_over);
-			(void) arc_evict(arc_mfu, NULL, toevict, FALSE,
-			    ARC_BUFC_METADATA);
-		}
+	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
 
-		tbl_over = arc_size + arc_mru_ghost->arcs_size +
-		    arc_mfu_ghost->arcs_size - arc_c * 2;
+	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
+		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
+		arc_evict_ghost(arc_mru_ghost, NULL, delta);
+	}
 
-		if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
-			todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
-			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
-		}
+	adjustment =
+	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
+
+	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
+		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
+		arc_evict_ghost(arc_mfu_ghost, NULL, delta);
 	}
 }
@@ -1723,29 +1822,34 @@ arc_do_user_evicts(void)
 void
 arc_flush(spa_t *spa)
 {
+	uint64_t guid = 0;
+
+	if (spa)
+		guid = spa_guid(spa);
+
 	while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
-		(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
+		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
 		if (spa)
 			break;
 	}
 	while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
-		(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
+		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
 		if (spa)
 			break;
 	}
 	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
-		(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
+		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
		if (spa)
 			break;
 	}
 	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
-		(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
+		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
 		if (spa)
 			break;
 	}
 
-	arc_evict_ghost(arc_mru_ghost, spa, -1);
-	arc_evict_ghost(arc_mfu_ghost, spa, -1);
+	arc_evict_ghost(arc_mru_ghost, guid, -1);
+	arc_evict_ghost(arc_mfu_ghost, guid, -1);
 
 	mutex_enter(&arc_reclaim_thr_lock);
 	arc_do_user_evicts();
@@ -1753,8 +1857,6 @@ arc_flush(spa_t *spa)
 	ASSERT(spa || arc_eviction_list == NULL);
 }
 
-int arc_shrink_shift = 5;	/* log2(fraction of arc to reclaim) */
-
 void
 arc_shrink(void)
 {
@@ -1953,6 +2055,7 @@ static void
 arc_adapt(int bytes, arc_state_t *state)
 {
 	int mult;
+	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
 
 	if (state == arc_l2c_only)
 		return;
@@ -1970,12 +2073,15 @@ arc_adapt(int bytes, arc_state_t *state)
 		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
 		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
 
-		arc_p = MIN(arc_c, arc_p + bytes * mult);
+		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
 	} else if (state == arc_mfu_ghost) {
+		uint64_t delta;
+
 		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
 		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
 
-		arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
+		delta = MIN(bytes * mult, arc_p);
+		arc_p = MAX(arc_p_min, arc_p - delta);
 	}
 	ASSERT((int64_t)arc_p >= 0);
@@ -2073,10 +2179,11 @@ arc_get_data_buf(arc_buf_t *buf)
 	if (!arc_evict_needed(type)) {
 		if (type == ARC_BUFC_METADATA) {
 			buf->b_data = zio_buf_alloc(size);
-			arc_space_consume(size);
+			arc_space_consume(size, ARC_SPACE_DATA);
 		} else {
 			ASSERT(type == ARC_BUFC_DATA);
 			buf->b_data = zio_data_buf_alloc(size);
+			ARCSTAT_INCR(arcstat_data_size, size);
 			atomic_add_64(&arc_size, size);
 		}
 		goto out;
 	}
@@ -2093,21 +2200,22 @@ arc_get_data_buf(arc_buf_t *buf)
 	if (state == arc_mru || state == arc_anon) {
 		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
-		state = (arc_mfu->arcs_lsize[type] > 0 &&
+		state = (arc_mfu->arcs_lsize[type] >= size &&
 		    arc_p > mru_used) ? arc_mfu : arc_mru;
 	} else {
 		/* MFU cases */
 		uint64_t mfu_space = arc_c - arc_p;
-		state = (arc_mru->arcs_lsize[type] > 0 &&
+		state = (arc_mru->arcs_lsize[type] >= size &&
 		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
 	}
 	if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
 		if (type == ARC_BUFC_METADATA) {
 			buf->b_data = zio_buf_alloc(size);
-			arc_space_consume(size);
+			arc_space_consume(size, ARC_SPACE_DATA);
 		} else {
 			ASSERT(type == ARC_BUFC_DATA);
 			buf->b_data = zio_data_buf_alloc(size);
+			ARCSTAT_INCR(arcstat_data_size, size);
 			atomic_add_64(&arc_size, size);
 		}
 		ARCSTAT_BUMP(arcstat_recycle_miss);
@@ -2309,7 +2417,7 @@ arc_read_done(zio_t *zio)
 	 * reason for it not to be found is if we were freed during the
 	 * read.
 	 */
-	found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
+	found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
 	    &hash_lock);
 
 	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
@@ -2433,7 +2541,6 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
     uint32_t *arc_flags, const zbookmark_t *zb)
 {
 	int err;
-	arc_buf_hdr_t *hdr = pbuf->b_hdr;
 
 	ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
 	ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
@@ -2441,9 +2548,8 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
 	err = arc_read_nolock(pio, spa, bp, done, private, priority,
 	    zio_flags, arc_flags, zb);
-
-	ASSERT3P(hdr, ==, pbuf->b_hdr);
 	rw_exit(&pbuf->b_lock);
+
 	return (err);
 }
@@ -2456,9 +2562,10 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
 	arc_buf_t *buf;
 	kmutex_t *hash_lock;
 	zio_t *rzio;
+	uint64_t guid = spa_guid(spa);
 
 top:
-	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+	hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
 	if (hdr && hdr->b_datacnt > 0) {
 
 		*arc_flags |= ARC_CACHED;
@@ -2481,7 +2588,7 @@ top:
 				acb->acb_private = private;
 				if (pio != NULL)
 					acb->acb_zio_dummy = zio_null(pio,
-					    spa, NULL, NULL, zio_flags);
+					    spa, NULL, NULL, NULL, zio_flags);
 
 				ASSERT(acb->acb_done != NULL);
 				acb->acb_next = hdr->b_acb;
@@ -2532,7 +2639,8 @@ top:
 		uint64_t size = BP_GET_LSIZE(bp);
 		arc_callback_t *acb;
 		vdev_t *vd = NULL;
-		daddr_t addr;
+		uint64_t addr;
+		boolean_t devw = B_FALSE;
 
 		if (hdr == NULL) {
 			/* this block is not in the cache */
@@ -2611,6 +2719,7 @@ top:
 		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
 		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
+			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
 			addr = hdr->b_l2hdr->b_daddr;
 			/*
 			 * Lock out device removal.
@@ -2630,7 +2739,7 @@ top:
 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 		    data, metadata, misses);
 
-		if (vd != NULL) {
+		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
 			/*
 			 * Read from the L2ARC if the following are true:
 			 * 1. The L2ARC vdev was previously cached.
 			 * 2. This buffer still has L2ARC metadata.
 			 * 3. This buffer isn't currently writing to the L2ARC.
 			 * 4. The L2ARC entry wasn't evicted, which may
 			 *    also have invalidated the vdev.
+			 * 5. This isn't prefetch and l2arc_noprefetch is set.
 			 */
 			if (hdr->b_l2hdr != NULL &&
-			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
+			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
+			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
 				l2arc_read_callback_t *cb;
 
 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
@@ -2666,6 +2777,7 @@ top:
 				    ZIO_FLAG_DONT_RETRY, B_FALSE);
 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio);
+				ARCSTAT_INCR(arcstat_l2_read_bytes, size);
 
 				if (*arc_flags & ARC_NOWAIT) {
 					zio_nowait(rzio);
@@ -2685,6 +2797,14 @@ top:
 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
 					spa_config_exit(spa, SCL_L2ARC, vd);
 				}
+			} else {
+				if (vd != NULL)
+					spa_config_exit(spa, SCL_L2ARC, vd);
+				if (l2arc_ndev != 0) {
+					DTRACE_PROBE1(l2arc__miss,
+					    arc_buf_hdr_t *, hdr);
+					ARCSTAT_BUMP(arcstat_l2_misses);
+				}
 			}
 
 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
@@ -2710,9 +2830,10 @@ arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
 {
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_mtx;
+	uint64_t guid = spa_guid(spa);
 	int rc = 0;
 
-	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
+	hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
 
 	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
 		arc_buf_t *buf = hdr->b_buf;
@@ -2837,6 +2958,7 @@ arc_release(arc_buf_t *buf, void *tag)
 	kmutex_t *hash_lock;
 	l2arc_buf_hdr_t *l2hdr;
 	uint64_t buf_size;
+	boolean_t released = B_FALSE;
 
 	rw_enter(&buf->b_lock, RW_WRITER);
 	hdr = buf->b_hdr;
@@ -2852,12 +2974,12 @@ arc_release(arc_buf_t *buf, void *tag)
 		ASSERT(buf->b_efunc == NULL);
 		arc_buf_thaw(buf);
 		rw_exit(&buf->b_lock);
-		return;
+		released = B_TRUE;
+	} else {
+		hash_lock = HDR_LOCK(hdr);
+		mutex_enter(hash_lock);
 	}
 
-	hash_lock = HDR_LOCK(hdr);
-	mutex_enter(hash_lock);
-
 	l2hdr = hdr->b_l2hdr;
 	if (l2hdr) {
 		mutex_enter(&l2arc_buflist_mtx);
@@ -2865,6 +2987,9 @@ arc_release(arc_buf_t *buf, void *tag)
 		buf_size = hdr->b_size;
 	}
 
+	if (released)
+		goto out;
+
 	/*
 	 * Do we have more than one buf?
 	 */
@@ -2872,7 +2997,7 @@ arc_release(arc_buf_t *buf, void *tag)
 		arc_buf_hdr_t *nhdr;
 		arc_buf_t **bufp;
 		uint64_t blksz = hdr->b_size;
-		spa_t *spa = hdr->b_spa;
+		uint64_t spa = hdr->b_spa;
 		arc_buf_contents_t type = hdr->b_type;
 		uint32_t flags = hdr->b_flags;
@@ -2932,6 +3057,7 @@ arc_release(arc_buf_t *buf, void *tag)
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
 
+out:
 	if (l2hdr) {
 		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
 		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
@@ -3156,12 +3282,13 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 	arc_buf_hdr_t *ab;
 	kmutex_t *hash_lock;
 	zio_t *zio;
+	uint64_t guid = spa_guid(spa);
 
 	/*
 	 * If this buffer is in the cache, release it, so it
 	 * can be re-used.
 	 */
-	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+	ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
 	if (ab != NULL) {
 		/*
 		 * The checksum of blocks to free is not always
@@ -3224,10 +3351,9 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 }
 
 static int
-arc_memory_throttle(uint64_t reserve, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
 {
 #ifdef _KERNEL
-	uint64_t inflight_data = arc_anon->arcs_size;
 	uint64_t available_memory = ptob(freemem);
 	static uint64_t page_load = 0;
 	static uint64_t last_txg = 0;
@@ -3289,6 +3415,7 @@ int
 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
 {
 	int error;
+	uint64_t anon_size;
 
 #ifdef ZFS_DEBUG
 	/*
@@ -3305,11 +3432,18 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
 		return (ENOMEM);
 
 	/*
+	 * Don't count loaned bufs as in flight dirty data to prevent long
+	 * network delays from blocking transactions that are ready to be
+	 * assigned to a txg.
+	 */
+	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
+
+	/*
 	 * Writes will, almost always, require additional memory allocations
 	 * in order to compress/encrypt/etc the data.  We therefor need to
 	 * make sure that there is sufficient available memory for this.
 	 */
-	if (error = arc_memory_throttle(reserve, txg))
+	if (error = arc_memory_throttle(reserve, anon_size, txg))
 		return (error);
 
 	/*
@@ -3319,8 +3453,9 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
 	 * Note: if two requests come in concurrently, we might let them
 	 * both succeed, when one of them should fail.  Not a huge deal.
 	 */
-	if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
-	    arc_anon->arcs_size > arc_c / 4) {
+
+	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
+	    anon_size > arc_c / 4) {
 		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
 		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
 		    arc_tempreserve>>10,
@@ -3385,6 +3520,15 @@ arc_init(void)
 	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
 		arc_c_min = arc_meta_limit / 2;
 
+	if (zfs_arc_grow_retry > 0)
+		arc_grow_retry = zfs_arc_grow_retry;
+
+	if (zfs_arc_shrink_shift > 0)
+		arc_shrink_shift = zfs_arc_shrink_shift;
+
+	if (zfs_arc_p_min_shift > 0)
+		arc_p_min_shift = zfs_arc_p_min_shift;
+
 	/* if kmem_flags are set, lets try to use less memory */
 	if (kmem_debugging())
 		arc_c = arc_c / 2;
@@ -3496,6 +3640,8 @@ arc_fini(void)
 	mutex_destroy(&zfs_write_limit_lock);
 
 	buf_fini();
+
+	ASSERT(arc_loaned_bytes == 0);
 }
 
 /*
@@ -3623,8 +3769,70 @@ arc_fini(void)
  *
 * Tunables may be removed or added as future performance improvements are
 * integrated, and also may become zpool properties.
+ *
+ * There are three key functions that control how the L2ARC warms up:
+ *
+ *	l2arc_write_eligible()	check if a buffer is eligible to cache
+ *	l2arc_write_size()	calculate how much to write
+ *	l2arc_write_interval()	calculate sleep delay between writes
+ *
+ * These three functions determine what to write, how much, and how quickly
+ * to send writes.
 */
+
+static boolean_t
+l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
+{
+	/*
+	 * A buffer is *not* eligible for the L2ARC if it:
+	 * 1. belongs to a different spa.
+	 * 2. has no attached buffer.
+	 * 3. is already cached on the L2ARC.
+	 * 4. has an I/O in progress (it may be an incomplete read).
+	 * 5. is flagged not eligible (zfs property).
+	 */
+	if (ab->b_spa != spa_guid || ab->b_buf == NULL || ab->b_l2hdr != NULL ||
+	    HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+static uint64_t
+l2arc_write_size(l2arc_dev_t *dev)
+{
+	uint64_t size;
+
+	size = dev->l2ad_write;
+
+	if (arc_warm == B_FALSE)
+		size += dev->l2ad_boost;
+
+	return (size);
+
+}
+
+static clock_t
+l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
+{
+	clock_t interval, next;
+
+	/*
+	 * If the ARC lists are busy, increase our write rate; if the
+	 * lists are stale, idle back.  This is achieved by checking
+	 * how much we previously wrote - if it was more than half of
+	 * what we wanted, schedule the next write much sooner.
+	 */
+	if (l2arc_feed_again && wrote > (wanted / 2))
+		interval = (hz * l2arc_feed_min_ms) / 1000;
+	else
+		interval = hz * l2arc_feed_secs;
+
+	next = MAX(lbolt, MIN(lbolt + interval, began + interval));
+
+	return (next);
+}
+
 static void
 l2arc_hdr_stat_add(void)
 {
@@ -3857,11 +4065,15 @@ l2arc_read_done(zio_t *zio)
 		 * storage now.  If there *is* a waiter, the caller must
 		 * issue the i/o in a context where it's OK to block.
 		 */
-		if (zio->io_waiter == NULL)
-			zio_nowait(zio_read(zio->io_parent,
-			    cb->l2rcb_spa, &cb->l2rcb_bp,
+		if (zio->io_waiter == NULL) {
+			zio_t *pio = zio_unique_parent(zio);
+
+			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
+
+			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
 			    buf->b_data, zio->io_size, arc_read_done, buf,
 			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
+		}
 	}
 
 	kmem_free(cb, sizeof (l2arc_read_callback_t));
@@ -4045,7 +4257,7 @@ top:
 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
 * for reading until they have completed writing.
 */
-static void
+static uint64_t
 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 {
 	arc_buf_hdr_t *ab, *ab_prev, *head;
@@ -4057,6 +4269,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	boolean_t have_lock, full;
 	l2arc_write_callback_t *cb;
 	zio_t *pio, *wzio;
+	uint64_t guid = spa_guid(spa);
 
 	ASSERT(dev->l2ad_vdev != NULL);
@@ -4110,20 +4323,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			break;
 		}
 
-		if (ab->b_spa != spa) {
-			mutex_exit(hash_lock);
-			continue;
-		}
-
-		if (ab->b_l2hdr != NULL) {
-			/*
-			 * Already in L2ARC.
-			 */
-			mutex_exit(hash_lock);
-			continue;
-		}
-
-		if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) {
+		if (!l2arc_write_eligible(guid, ab)) {
 			mutex_exit(hash_lock);
 			continue;
 		}
@@ -4134,12 +4334,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			break;
 		}
 
-		if (ab->b_buf == NULL) {
-			DTRACE_PROBE1(l2arc__buf__null, void *, ab);
-			mutex_exit(hash_lock);
-			continue;
-		}
-
 		if (pio == NULL) {
 			/*
 			 * Insert a dummy header on the buflist so
@@ -4206,11 +4400,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	if (pio == NULL) {
 		ASSERT3U(write_sz, ==, 0);
 		kmem_cache_free(hdr_cache, head);
-		return;
+		return (0);
 	}
 
 	ASSERT3U(write_sz, <=, target_sz);
 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
+	ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
 	ARCSTAT_INCR(arcstat_l2_size, write_sz);
 	spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
@@ -4226,7 +4421,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 		dev->l2ad_first = B_FALSE;
 	}
 
+	dev->l2ad_writing = B_TRUE;
 	(void) zio_wait(pio);
+	dev->l2ad_writing = B_FALSE;
+
+	return (write_sz);
 }
 
 /*
@@ -4239,20 +4438,19 @@ l2arc_feed_thread(void)
 	callb_cpr_t cpr;
 	l2arc_dev_t *dev;
 	spa_t *spa;
-	uint64_t size;
+	uint64_t size, wrote;
+	clock_t begin, next = lbolt;
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&l2arc_feed_thr_lock);
 
 	while (l2arc_thread_exit == 0) {
-		/*
-		 * Pause for l2arc_feed_secs seconds between writes.
-		 */
 		CALLB_CPR_SAFE_BEGIN(&cpr);
 		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
-		    lbolt + (hz * l2arc_feed_secs));
+		    next);
 		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
+		next = lbolt + hz;
 
 		/*
 		 * Quick check for L2ARC devices.
@@ -4263,6 +4461,7 @@ l2arc_feed_thread(void)
 			continue;
 		}
 		mutex_exit(&l2arc_dev_mtx);
+		begin = lbolt;
 
 		/*
 		 * This selects the next l2arc device to write to, and in
@@ -4291,9 +4490,7 @@ l2arc_feed_thread(void)
 
 		ARCSTAT_BUMP(arcstat_l2_feeds);
 
-		size = dev->l2ad_write;
-		if (arc_warm == B_FALSE)
-			size += dev->l2ad_boost;
+		size = l2arc_write_size(dev);
 
 		/*
 		 * Evict L2ARC buffers that will be overwritten.
@@ -4303,7 +4500,12 @@ l2arc_feed_thread(void)
 		/*
 		 * Write ARC buffers.
 		 */
-		l2arc_write_buffers(spa, dev, size);
+		wrote = l2arc_write_buffers(spa, dev, size);
+
+		/*
+		 * Calculate interval between writes.
+		 */
+		next = l2arc_write_interval(begin, size, wrote);
 
 		spa_config_exit(spa, SCL_L2ARC, dev);
 	}
@@ -4334,7 +4536,7 @@ l2arc_vdev_present(vdev_t *vd)
 * validated the vdev and opened it.
 */
 void
-l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
+l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 {
 	l2arc_dev_t *adddev;
 
@@ -4348,11 +4550,12 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
 	adddev->l2ad_vdev = vd;
 	adddev->l2ad_write = l2arc_write_max;
 	adddev->l2ad_boost = l2arc_write_boost;
-	adddev->l2ad_start = start;
-	adddev->l2ad_end = end;
+	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
 	adddev->l2ad_hand = adddev->l2ad_start;
 	adddev->l2ad_evict = adddev->l2ad_start;
 	adddev->l2ad_first = B_TRUE;
+	adddev->l2ad_writing = B_FALSE;
 	ASSERT3U(adddev->l2ad_write, >, 0);
 
 	/*