Allow arc_evict_ghost() to only evict meta data
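
arc_evict_ghost() previously always operated on the ARC_BUFC_DATA list of a
ghost state.  This change adds an arc_buf_contents_t argument so a caller can
choose which ghost list (data or metadata) to shrink; the existing call sites
are updated to pass ARC_BUFC_DATA explicitly.

A minimal sketch of the widened interface follows (a hypothetical caller
inside arc.c, not part of this patch):

    /*
     * New signature introduced by this patch:
     *   arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
     *       arc_buf_contents_t type);
     *
     * Hypothetical metadata-only trim of the MRU ghost state; spa == 0
     * matches any pool, as in the existing callers.
     */
    arc_evict_ghost(arc_mru_ghost, 0, 1024 * 1024, ARC_BUFC_METADATA);
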
[zfs.git] / module / zfs / arc.c
index eab1b16..ce4a023 100644
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 /*
 
 #include <sys/spa.h>
 #include <sys/zio.h>
+#include <sys/zio_compress.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/vdev.h>
@@ -148,7 +150,7 @@ static kcondvar_t   arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 static uint8_t         arc_thread_exit;
 
 /* number of bytes to prune from caches when arc_meta_limit is reached */
-uint_t arc_meta_prune = 1048576;
+int zfs_arc_meta_prune = 1048576;
 
 typedef enum arc_reclaim_strategy {
        ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
@@ -156,25 +158,31 @@ typedef enum arc_reclaim_strategy {
 } arc_reclaim_strategy_t;
 
 /* number of seconds before growing cache again */
-static int             arc_grow_retry = 5;
-
-/* expiration time for arc_no_grow */
-static clock_t         arc_grow_time = 0;
+int zfs_arc_grow_retry = 5;
 
 /* shift of arc_c for calculating both min and max arc_p */
-static int             arc_p_min_shift = 4;
+int zfs_arc_p_min_shift = 4;
 
 /* log2(fraction of arc to reclaim) */
-static int             arc_shrink_shift = 5;
+int zfs_arc_shrink_shift = 5;
 
 /*
  * minimum lifespan of a prefetch block in clock ticks
  * (initialized in arc_init())
  */
-static int             arc_min_prefetch_lifespan;
+int zfs_arc_min_prefetch_lifespan = HZ;
+
+/* disable proactive arc throttle due to low memory */
+int zfs_arc_memory_throttle_disable = 1;
+
+/* disable duplicate buffer eviction */
+int zfs_disable_dup_eviction = 0;
 
 static int arc_dead;
 
+/* expiration time for arc_no_grow */
+static clock_t arc_grow_time = 0;
+
 /*
  * The arc has filled available memory and has now warmed up.
  */
@@ -186,11 +194,6 @@ static boolean_t arc_warm;
 unsigned long zfs_arc_max = 0;
 unsigned long zfs_arc_min = 0;
 unsigned long zfs_arc_meta_limit = 0;
-int zfs_arc_grow_retry = 0;
-int zfs_arc_shrink_shift = 0;
-int zfs_arc_p_min_shift = 0;
-int zfs_disable_dup_eviction = 0;
-int zfs_arc_meta_prune = 0;
 
 /*
  * Note that buffers can be in one of 6 states:
@@ -306,7 +309,11 @@ typedef struct arc_stats {
        kstat_named_t arcstat_l2_cksum_bad;
        kstat_named_t arcstat_l2_io_error;
        kstat_named_t arcstat_l2_size;
+       kstat_named_t arcstat_l2_asize;
        kstat_named_t arcstat_l2_hdr_size;
+       kstat_named_t arcstat_l2_compress_successes;
+       kstat_named_t arcstat_l2_compress_zeros;
+       kstat_named_t arcstat_l2_compress_failures;
        kstat_named_t arcstat_memory_throttle_count;
        kstat_named_t arcstat_duplicate_buffers;
        kstat_named_t arcstat_duplicate_buffers_size;
@@ -389,7 +396,11 @@ static arc_stats_t arc_stats = {
        { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
        { "l2_io_error",                KSTAT_DATA_UINT64 },
        { "l2_size",                    KSTAT_DATA_UINT64 },
+       { "l2_asize",                   KSTAT_DATA_UINT64 },
        { "l2_hdr_size",                KSTAT_DATA_UINT64 },
+       { "l2_compress_successes",      KSTAT_DATA_UINT64 },
+       { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
+       { "l2_compress_failures",       KSTAT_DATA_UINT64 },
        { "memory_throttle_count",      KSTAT_DATA_UINT64 },
        { "duplicate_buffers",          KSTAT_DATA_UINT64 },
        { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
@@ -471,6 +482,9 @@ static arc_state_t  *arc_l2c_only;
 #define        arc_meta_limit  ARCSTAT(arcstat_meta_limit)
 #define        arc_meta_max    ARCSTAT(arcstat_meta_max)
 
+#define        L2ARC_IS_VALID_COMPRESS(_c_) \
+       ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
+
 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 
 typedef struct arc_callback arc_callback_t;
@@ -500,7 +514,6 @@ struct arc_buf_hdr {
 
        kmutex_t                b_freeze_lock;
        zio_cksum_t             *b_freeze_cksum;
-       void                    *b_thawed;
 
        arc_buf_hdr_t           *b_hash_next;
        arc_buf_t               *b_buf;
@@ -537,7 +550,8 @@ static arc_buf_hdr_t arc_eviction_hdr;
 static void arc_get_data_buf(arc_buf_t *buf);
 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 static int arc_evict_needed(arc_buf_contents_t type);
-static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
+static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type);
 
 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 
@@ -622,7 +636,12 @@ uint64_t zfs_crc64_table[256];
  */
 
 #define        L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
-#define        L2ARC_HEADROOM          2               /* num of writes */
+#define        L2ARC_HEADROOM          2                       /* num of writes */
+/*
+ * If we discover any compressible buffers during an ARC scan, we boost
+ * our headroom for the next scanning cycle by this percentage multiple.
+ */
+#define        L2ARC_HEADROOM_BOOST    200
 #define        L2ARC_FEED_SECS         1               /* caching interval secs */
 #define        L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 
@@ -635,11 +654,13 @@ uint64_t zfs_crc64_table[256];
 unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;      /* def max write size */
 unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;    /* extra warmup write */
 unsigned long l2arc_headroom = L2ARC_HEADROOM;         /* # of dev writes */
+unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;       /* interval seconds */
 unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;   /* min interval msecs */
 int l2arc_noprefetch = B_TRUE;                 /* don't cache prefetch bufs */
+int l2arc_nocompress = B_FALSE;                        /* don't compress bufs */
 int l2arc_feed_again = B_TRUE;                 /* turbo warmup */
-int l2arc_norw = B_TRUE;                       /* no reads during writes */
+int l2arc_norw = B_FALSE;                      /* no reads during writes */
 
 /*
  * L2ARC Internals
@@ -648,8 +669,6 @@ typedef struct l2arc_dev {
        vdev_t                  *l2ad_vdev;     /* vdev */
        spa_t                   *l2ad_spa;      /* spa */
        uint64_t                l2ad_hand;      /* next write location */
-       uint64_t                l2ad_write;     /* desired write size, bytes */
-       uint64_t                l2ad_boost;     /* warmup write boost, bytes */
        uint64_t                l2ad_start;     /* first addr on device */
        uint64_t                l2ad_end;       /* last addr on device */
        uint64_t                l2ad_evict;     /* last addr eviction reached */
@@ -670,11 +689,12 @@ static kmutex_t l2arc_free_on_write_mtx;  /* mutex for list */
 static uint64_t l2arc_ndev;                    /* number of devices */
 
 typedef struct l2arc_read_callback {
-       arc_buf_t       *l2rcb_buf;             /* read buffer */
-       spa_t           *l2rcb_spa;             /* spa */
-       blkptr_t        l2rcb_bp;               /* original blkptr */
-       zbookmark_t     l2rcb_zb;               /* original bookmark */
-       int             l2rcb_flags;            /* original flags */
+       arc_buf_t               *l2rcb_buf;             /* read buffer */
+       spa_t                   *l2rcb_spa;             /* spa */
+       blkptr_t                l2rcb_bp;               /* original blkptr */
+       zbookmark_t             l2rcb_zb;               /* original bookmark */
+       int                     l2rcb_flags;            /* original flags */
+       enum zio_compress       l2rcb_compress;         /* applied compress */
 } l2arc_read_callback_t;
 
 typedef struct l2arc_write_callback {
@@ -684,8 +704,14 @@ typedef struct l2arc_write_callback {
 
 struct l2arc_buf_hdr {
        /* protected by arc_buf_hdr  mutex */
-       l2arc_dev_t     *b_dev;                 /* L2ARC device */
-       uint64_t        b_daddr;                /* disk address, offset byte */
+       l2arc_dev_t             *b_dev;         /* L2ARC device */
+       uint64_t                b_daddr;        /* disk address, offset byte */
+       /* compression applied to buffer data */
+       enum zio_compress       b_compress;
+       /* real alloc'd buffer size depending on b_compress applied */
+       int                     b_asize;
+       /* temporary buffer holder for in-flight compressed data */
+       void                    *b_tmp_cdata;
 };
 
 typedef struct l2arc_data_free {
@@ -704,6 +730,11 @@ static void l2arc_read_done(zio_t *zio);
 static void l2arc_hdr_stat_add(void);
 static void l2arc_hdr_stat_remove(void);
 
+static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
+static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
+    enum zio_compress c);
+static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
+
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
@@ -883,7 +914,6 @@ buf_cons(void *vbuf, void *unused, int kmflag)
 
        bzero(buf, sizeof (arc_buf_t));
        mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
-       rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
        arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 
        return (0);
@@ -913,7 +943,6 @@ buf_dest(void *vbuf, void *unused)
        arc_buf_t *buf = vbuf;
 
        mutex_destroy(&buf->b_evict_lock);
-       rw_destroy(&buf->b_data_lock);
        arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
@@ -1034,12 +1063,6 @@ arc_buf_thaw(arc_buf_t *buf)
                buf->b_hdr->b_freeze_cksum = NULL;
        }
 
-       if (zfs_flags & ZFS_DEBUG_MODIFY) {
-               if (buf->b_hdr->b_thawed)
-                       kmem_free(buf->b_hdr->b_thawed, 1);
-               buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
-       }
-
        mutex_exit(&buf->b_hdr->b_freeze_lock);
 }
 
@@ -1076,7 +1099,7 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
                ASSERT(list_link_active(&ab->b_arc_node));
                list_remove(list, ab);
                if (GHOST_STATE(ab->b_state)) {
-                       ASSERT3U(ab->b_datacnt, ==, 0);
+                       ASSERT0(ab->b_datacnt);
                        ASSERT3P(ab->b_buf, ==, NULL);
                        delta = ab->b_size;
                }
@@ -1261,23 +1284,6 @@ arc_space_return(uint64_t space, arc_space_type_t type)
        atomic_add_64(&arc_size, -space);
 }
 
-void *
-arc_data_buf_alloc(uint64_t size)
-{
-       if (arc_evict_needed(ARC_BUFC_DATA))
-               cv_signal(&arc_reclaim_thr_cv);
-       atomic_add_64(&arc_size, size);
-       return (zio_data_buf_alloc(size));
-}
-
-void
-arc_data_buf_free(void *buf, uint64_t size)
-{
-       zio_data_buf_free(buf, size);
-       ASSERT(arc_size >= size);
-       atomic_add_64(&arc_size, -size);
-}
-
 arc_buf_t *
 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
 {
@@ -1545,7 +1551,9 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
                if (l2hdr != NULL) {
                        list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
                        ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+                       ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
                        kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+                       arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
                        if (hdr->b_state == arc_l2c_only)
                                l2arc_hdr_stat_remove();
                        hdr->b_l2hdr = NULL;
@@ -1581,10 +1589,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
                kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
                hdr->b_freeze_cksum = NULL;
        }
-       if (hdr->b_thawed) {
-               kmem_free(hdr->b_thawed, 1);
-               hdr->b_thawed = NULL;
-       }
 
        ASSERT(!list_link_active(&hdr->b_arc_node));
        ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1643,7 +1647,7 @@ int
 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
 {
        arc_buf_hdr_t *hdr = buf->b_hdr;
-       kmutex_t *hash_lock = HDR_LOCK(hdr);
+       kmutex_t *hash_lock = NULL;
        int no_callback = (buf->b_efunc == NULL);
 
        if (hdr->b_state == arc_anon) {
@@ -1652,6 +1656,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
                return (no_callback);
        }
 
+       hash_lock = HDR_LOCK(hdr);
        mutex_enter(hash_lock);
        hdr = buf->b_hdr;
        ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
@@ -1760,7 +1765,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                    (spa && ab->b_spa != spa) ||
                    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
                    ddi_get_lbolt() - ab->b_arc_access <
-                   arc_min_prefetch_lifespan)) {
+                   zfs_arc_min_prefetch_lifespan)) {
                        skipped++;
                        continue;
                }
@@ -1771,7 +1776,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                hash_lock = HDR_LOCK(ab);
                have_lock = MUTEX_HELD(hash_lock);
                if (have_lock || mutex_tryenter(hash_lock)) {
-                       ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+                       ASSERT0(refcount_count(&ab->b_refcnt));
                        ASSERT(ab->b_datacnt > 0);
                        while (ab->b_buf) {
                                arc_buf_t *buf = ab->b_buf;
@@ -1860,12 +1865,14 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
                        int64_t todelete =
                            MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
-                       arc_evict_ghost(arc_mru_ghost, 0, todelete);
+                       arc_evict_ghost(arc_mru_ghost, 0, todelete,
+                           ARC_BUFC_DATA);
                } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
                        int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
                            arc_mru_ghost->arcs_size +
                            arc_mfu_ghost->arcs_size - arc_c);
-                       arc_evict_ghost(arc_mfu_ghost, 0, todelete);
+                       arc_evict_ghost(arc_mfu_ghost, 0, todelete,
+                           ARC_BUFC_DATA);
                }
        }
 
@@ -1877,11 +1884,12 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
  * bytes.  Destroy the buffers that are removed.
  */
 static void
-arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type)
 {
        arc_buf_hdr_t *ab, *ab_prev;
        arc_buf_hdr_t marker;
-       list_t *list = &state->arcs_list[ARC_BUFC_DATA];
+       list_t *list = &state->arcs_list[type];
        kmutex_t *hash_lock;
        uint64_t bytes_deleted = 0;
        uint64_t bufs_skipped = 0;
@@ -2011,7 +2019,7 @@ arc_adjust(void)
 
        if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
                delta = MIN(arc_mru_ghost->arcs_size, adjustment);
-               arc_evict_ghost(arc_mru_ghost, 0, delta);
+               arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_DATA);
        }
 
        adjustment =
@@ -2019,7 +2027,7 @@ arc_adjust(void)
 
        if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
                delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
-               arc_evict_ghost(arc_mfu_ghost, 0, delta);
+               arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_DATA);
        }
 }
 
@@ -2111,7 +2119,7 @@ arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
        }
 
        if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
-               arc_do_user_prune(arc_meta_prune);
+               arc_do_user_prune(zfs_arc_meta_prune);
 }
 
 /*
@@ -2147,8 +2155,8 @@ arc_flush(spa_t *spa)
                        break;
        }
 
-       arc_evict_ghost(arc_mru_ghost, guid, -1);
-       arc_evict_ghost(arc_mfu_ghost, guid, -1);
+       arc_evict_ghost(arc_mru_ghost, guid, -1, ARC_BUFC_DATA);
+       arc_evict_ghost(arc_mfu_ghost, guid, -1, ARC_BUFC_DATA);
 
        mutex_enter(&arc_reclaim_thr_lock);
        arc_do_user_evicts();
@@ -2162,14 +2170,14 @@ arc_shrink(uint64_t bytes)
        if (arc_c > arc_c_min) {
                uint64_t to_free;
 
-               to_free = bytes ? bytes : arc_c >> arc_shrink_shift;
+               to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift;
 
                if (arc_c > arc_c_min + to_free)
                        atomic_add_64(&arc_c, -to_free);
                else
                        arc_c = arc_c_min;
 
-               atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
+               atomic_add_64(&arc_p, -(arc_p >> zfs_arc_shrink_shift));
                if (arc_c > arc_size)
                        arc_c = MAX(arc_size, arc_c_min);
                if (arc_p > arc_c)
@@ -2248,7 +2256,7 @@ arc_adapt_thread(void)
                        }
 
                        /* reset the growth delay for every reclaim */
-                       arc_grow_time = ddi_get_lbolt()+(arc_grow_retry * hz);
+                       arc_grow_time = ddi_get_lbolt()+(zfs_arc_grow_retry * hz);
 
                        arc_kmem_reap_now(last_reclaim, 0);
                        arc_warm = B_TRUE;
@@ -2278,6 +2286,26 @@ arc_adapt_thread(void)
                (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
                    &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
                CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+
+               /* Allow the module options to be changed */
+               if (zfs_arc_max > 64 << 20 &&
+                   zfs_arc_max < physmem * PAGESIZE &&
+                   zfs_arc_max != arc_c_max)
+                       arc_c_max = zfs_arc_max;
+
+               if (zfs_arc_min > 0 &&
+                   zfs_arc_min < arc_c_max &&
+                   zfs_arc_min != arc_c_min)
+                       arc_c_min = zfs_arc_min;
+
+               if (zfs_arc_meta_limit > 0 &&
+                   zfs_arc_meta_limit <= arc_c_max &&
+                   zfs_arc_meta_limit != arc_meta_limit)
+                       arc_meta_limit = zfs_arc_meta_limit;
        }
 
        arc_thread_exit = 0;
@@ -2381,10 +2409,8 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
         */
        if (pages > 0) {
                arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
-               pages = btop(arc_evictable_memory());
        } else {
                arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
-               pages = -1;
        }
 
        /*
@@ -2398,13 +2424,13 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
                ARCSTAT_BUMP(arcstat_memory_indirect_count);
        } else {
                arc_no_grow = B_TRUE;
-               arc_grow_time = ddi_get_lbolt() + (arc_grow_retry * hz);
+               arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
                ARCSTAT_BUMP(arcstat_memory_direct_count);
        }
 
        mutex_exit(&arc_reclaim_thr_lock);
 
-       return (pages);
+       return (-1);
 }
 SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
 
@@ -2420,7 +2446,7 @@ static void
 arc_adapt(int bytes, arc_state_t *state)
 {
        int mult;
-       uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+       uint64_t arc_p_min = (arc_c >> zfs_arc_p_min_shift);
 
        if (state == arc_l2c_only)
                return;
@@ -2717,7 +2743,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
                         * This is a prefetch access...
                         * move this block back to the MRU state.
                         */
-                       ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
+                       ASSERT0(refcount_count(&buf->b_refcnt));
                        new_state = arc_mru;
                }
 
@@ -2801,10 +2827,10 @@ arc_read_done(zio_t *zio)
        if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
                dmu_object_byteswap_t bswap =
                    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
-               arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
-                   byteswap_uint64_array :
-                   dmu_ot_byteswap[bswap].ob_func;
-               func(buf->b_data, hdr->b_size);
+               if (BP_GET_LEVEL(zio->io_bp) > 0)
+                   byteswap_uint64_array(buf->b_data, hdr->b_size);
+               else
+                   dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
        }
 
        arc_cksum_compute(buf, B_FALSE);
@@ -2906,42 +2932,11 @@ arc_read_done(zio_t *zio)
  *
  * arc_read_done() will invoke all the requested "done" functions
  * for readers of this block.
- *
- * Normal callers should use arc_read and pass the arc buffer and offset
- * for the bp.  But if you know you don't need locking, you can use
- * arc_read_bp.
  */
 int
-arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
-    arc_done_func_t *done, void *private, int priority, int zio_flags,
-    uint32_t *arc_flags, const zbookmark_t *zb)
-{
-       int err;
-
-       if (pbuf == NULL) {
-               /*
-                * XXX This happens from traverse callback funcs, for
-                * the objset_phys_t block.
-                */
-               return (arc_read_nolock(pio, spa, bp, done, private, priority,
-                   zio_flags, arc_flags, zb));
-       }
-
-       ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
-       ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
-       rw_enter(&pbuf->b_data_lock, RW_READER);
-
-       err = arc_read_nolock(pio, spa, bp, done, private, priority,
-           zio_flags, arc_flags, zb);
-       rw_exit(&pbuf->b_data_lock);
-
-       return (err);
-}
-
-int
-arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
-    arc_done_func_t *done, void *private, int priority, int zio_flags,
-    uint32_t *arc_flags, const zbookmark_t *zb)
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
+    void *private, int priority, int zio_flags, uint32_t *arc_flags,
+    const zbookmark_t *zb)
 {
        arc_buf_hdr_t *hdr;
        arc_buf_t *buf = NULL;
@@ -3014,6 +3009,8 @@ top:
                arc_access(hdr, hash_lock);
                if (*arc_flags & ARC_L2CACHE)
                        hdr->b_flags |= ARC_L2CACHE;
+               if (*arc_flags & ARC_L2COMPRESS)
+                       hdr->b_flags |= ARC_L2COMPRESS;
                mutex_exit(hash_lock);
                ARCSTAT_BUMP(arcstat_hits);
                ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
@@ -3054,13 +3051,15 @@ top:
                        }
                        if (*arc_flags & ARC_L2CACHE)
                                hdr->b_flags |= ARC_L2CACHE;
+                       if (*arc_flags & ARC_L2COMPRESS)
+                               hdr->b_flags |= ARC_L2COMPRESS;
                        if (BP_GET_LEVEL(bp) > 0)
                                hdr->b_flags |= ARC_INDIRECT;
                } else {
                        /* this block is in the ghost cache */
                        ASSERT(GHOST_STATE(hdr->b_state));
                        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-                       ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
+                       ASSERT0(refcount_count(&hdr->b_refcnt));
                        ASSERT(hdr->b_buf == NULL);
 
                        /* if this is a prefetch, we don't have a reference */
@@ -3070,6 +3069,8 @@ top:
                                add_reference(hdr, hash_lock, private);
                        if (*arc_flags & ARC_L2CACHE)
                                hdr->b_flags |= ARC_L2CACHE;
+                       if (*arc_flags & ARC_L2COMPRESS)
+                               hdr->b_flags |= ARC_L2COMPRESS;
                        buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
                        buf->b_hdr = hdr;
                        buf->b_data = NULL;
@@ -3140,20 +3141,36 @@ top:
                                cb->l2rcb_bp = *bp;
                                cb->l2rcb_zb = *zb;
                                cb->l2rcb_flags = zio_flags;
+                               cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
 
                                /*
                                 * l2arc read.  The SCL_L2ARC lock will be
                                 * released by l2arc_read_done().
+                                * Issue a null zio if the underlying buffer
+                                * was squashed to zero size by compression.
                                 */
-                               rzio = zio_read_phys(pio, vd, addr, size,
-                                   buf->b_data, ZIO_CHECKSUM_OFF,
-                                   l2arc_read_done, cb, priority, zio_flags |
-                                   ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
-                                   ZIO_FLAG_DONT_PROPAGATE |
-                                   ZIO_FLAG_DONT_RETRY, B_FALSE);
+                               if (hdr->b_l2hdr->b_compress ==
+                                   ZIO_COMPRESS_EMPTY) {
+                                       rzio = zio_null(pio, spa, vd,
+                                           l2arc_read_done, cb,
+                                           zio_flags | ZIO_FLAG_DONT_CACHE |
+                                           ZIO_FLAG_CANFAIL |
+                                           ZIO_FLAG_DONT_PROPAGATE |
+                                           ZIO_FLAG_DONT_RETRY);
+                               } else {
+                                       rzio = zio_read_phys(pio, vd, addr,
+                                           hdr->b_l2hdr->b_asize,
+                                           buf->b_data, ZIO_CHECKSUM_OFF,
+                                           l2arc_read_done, cb, priority,
+                                           zio_flags | ZIO_FLAG_DONT_CACHE |
+                                           ZIO_FLAG_CANFAIL |
+                                           ZIO_FLAG_DONT_PROPAGATE |
+                                           ZIO_FLAG_DONT_RETRY, B_FALSE);
+                               }
                                DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
                                    zio_t *, rzio);
-                               ARCSTAT_INCR(arcstat_l2_read_bytes, size);
+                               ARCSTAT_INCR(arcstat_l2_read_bytes,
+                                   hdr->b_l2hdr->b_asize);
 
                                if (*arc_flags & ARC_NOWAIT) {
                                        zio_nowait(rzio);
@@ -3240,6 +3257,34 @@ arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
 }
 
 /*
+ * Notify the arc that a block was freed, and thus will never be used again.
+ */
+void
+arc_freed(spa_t *spa, const blkptr_t *bp)
+{
+       arc_buf_hdr_t *hdr;
+       kmutex_t *hash_lock;
+       uint64_t guid = spa_load_guid(spa);
+
+       hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+           &hash_lock);
+       if (hdr == NULL)
+               return;
+       if (HDR_BUF_AVAILABLE(hdr)) {
+               arc_buf_t *buf = hdr->b_buf;
+               add_reference(hdr, hash_lock, FTAG);
+               hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+               mutex_exit(hash_lock);
+
+               arc_release(buf, FTAG);
+               (void) arc_buf_remove_ref(buf, FTAG);
+       } else {
+               mutex_exit(hash_lock);
+       }
+}
+
+/*
  * This is used by the DMU to let the ARC know that a buffer is
  * being evicted, so the ARC should clean up.  If this arc buf
  * is not yet in the evicted state, it will be put there.
@@ -3444,26 +3489,15 @@ arc_release(arc_buf_t *buf, void *tag)
        buf->b_private = NULL;
 
        if (l2hdr) {
+               ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
                list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
                kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+               arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
                ARCSTAT_INCR(arcstat_l2_size, -buf_size);
                mutex_exit(&l2arc_buflist_mtx);
        }
 }
 
-/*
- * Release this buffer.  If it does not match the provided BP, fill it
- * with that block's contents.
- */
-/* ARGSUSED */
-int
-arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
-    zbookmark_t *zb)
-{
-       arc_release(buf, tag);
-       return (0);
-}
-
 int
 arc_released(arc_buf_t *buf)
 {
@@ -3600,9 +3634,9 @@ arc_write_done(zio_t *zio)
 
 zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
-    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
-    arc_done_func_t *ready, arc_done_func_t *done, void *private,
-    int priority, int zio_flags, const zbookmark_t *zb)
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
+    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
+    void *private, int priority, int zio_flags, const zbookmark_t *zb)
 {
        arc_buf_hdr_t *hdr = buf->b_hdr;
        arc_write_callback_t *callback;
@@ -3615,6 +3649,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
        ASSERT(hdr->b_acb == NULL);
        if (l2arc)
                hdr->b_flags |= ARC_L2CACHE;
+       if (l2arc_compress)
+               hdr->b_flags |= ARC_L2COMPRESS;
        callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE);
        callback->awcb_ready = ready;
        callback->awcb_done = done;
@@ -3633,6 +3669,9 @@ arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
 #ifdef _KERNEL
        uint64_t available_memory;
 
+       if (zfs_arc_memory_throttle_disable)
+               return (0);
+
        /* Easily reclaimable memory (free + inactive + arc-evictable) */
        available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();
 
@@ -3767,7 +3806,7 @@ arc_init(void)
        cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
 
        /* Convert seconds to clock ticks */
-       arc_min_prefetch_lifespan = 1 * hz;
+       zfs_arc_min_prefetch_lifespan = 1 * hz;
 
        /* Start out with 1/8 of all memory */
        arc_c = physmem * PAGESIZE / 8;
@@ -3815,18 +3854,6 @@ arc_init(void)
        if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
                arc_c_min = arc_meta_limit / 2;
 
-       if (zfs_arc_grow_retry > 0)
-               arc_grow_retry = zfs_arc_grow_retry;
-
-       if (zfs_arc_shrink_shift > 0)
-               arc_shrink_shift = zfs_arc_shrink_shift;
-
-       if (zfs_arc_p_min_shift > 0)
-               arc_p_min_shift = zfs_arc_p_min_shift;
-
-       if (zfs_arc_meta_prune > 0)
-               arc_meta_prune = zfs_arc_meta_prune;
-
        /* if kmem_flags are set, lets try to use less memory */
        if (kmem_debugging())
                arc_c = arc_c / 2;
@@ -4018,8 +4045,12 @@ arc_fini(void)
  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
  * It does this by periodically scanning buffers from the eviction-end of
  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
- * not already there.  It scans until a headroom of buffers is satisfied,
- * which itself is a buffer for ARC eviction.  The thread that does this is
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. If a compressible buffer is
+ * found during scanning and selected for writing to an L2ARC device, we
+ * temporarily boost scanning headroom during the next scan cycle to make
+ * sure we adapt to compression effects (which might significantly reduce
+ * the data volume we write to L2ARC). The thread that does this is
  * l2arc_feed_thread(), illustrated below; example sizes are included to
  * provide a better sense of ratio than this diagram:
  *
@@ -4083,7 +4114,13 @@ arc_fini(void)
  *     l2arc_write_max         max write bytes per interval
  *     l2arc_write_boost       extra write bytes during device warmup
  *     l2arc_noprefetch        skip caching prefetched buffers
+ *     l2arc_nocompress        skip compressing buffers
  *     l2arc_headroom          number of max device writes to precache
+ *     l2arc_headroom_boost    when we find compressed buffers during ARC
+ *                             scanning, we multiply headroom by this
+ *                             percentage factor for the next scan cycle,
+ *                             since more compressed buffers are likely to
+ *                             be present
  *     l2arc_feed_secs         seconds between L2ARC writing
  *
  * Tunables may be removed or added as future performance improvements are
@@ -4117,14 +4154,24 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
 }
 
 static uint64_t
-l2arc_write_size(l2arc_dev_t *dev)
+l2arc_write_size(void)
 {
        uint64_t size;
 
-       size = dev->l2ad_write;
+       /*
+        * Make sure our globals have meaningful values in case the user
+        * altered them.
+        */
+       size = l2arc_write_max;
+       if (size == 0) {
+               cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
+                   "be greater than zero, resetting it to the default (%d)",
+                   L2ARC_WRITE_SIZE);
+               size = l2arc_write_max = L2ARC_WRITE_SIZE;
+       }
 
        if (arc_warm == B_FALSE)
-               size += dev->l2ad_boost;
+               size += l2arc_write_boost;
 
        return (size);
 
@@ -4155,14 +4202,14 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
 static void
 l2arc_hdr_stat_add(void)
 {
-       ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
+       ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE);
        ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
 }
 
 static void
 l2arc_hdr_stat_remove(void)
 {
-       ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
+       ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE);
        ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
 }
 
@@ -4298,14 +4345,23 @@ l2arc_write_done(zio_t *zio)
                        continue;
                }
 
+               abl2 = ab->b_l2hdr;
+
+               /*
+                * Release the temporary compressed buffer as soon as possible.
+                */
+               if (abl2->b_compress != ZIO_COMPRESS_OFF)
+                       l2arc_release_cdata_buf(ab);
+
                if (zio->io_error != 0) {
                        /*
                         * Error - drop L2ARC entry.
                         */
                        list_remove(buflist, ab);
-                       abl2 = ab->b_l2hdr;
+                       ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
                        ab->b_l2hdr = NULL;
                        kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+                       arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
                        ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
                }
 
@@ -4356,6 +4412,13 @@ l2arc_read_done(zio_t *zio)
        ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
        /*
+        * If the buffer was compressed, decompress it first.
+        */
+       if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
+               l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
+       ASSERT(zio->io_data != NULL);
+
+       /*
         * Check this survived the L2ARC journey.
         */
        equal = arc_cksum_equal(buf);
@@ -4550,8 +4613,10 @@ top:
                         */
                        if (ab->b_l2hdr != NULL) {
                                abl2 = ab->b_l2hdr;
+                               ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
                                ab->b_l2hdr = NULL;
                                kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+                               arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
                                ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
                        }
                        list_remove(buflist, ab);
@@ -4575,37 +4640,54 @@ top:
  *
  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
  * for reading until they have completed writing.
+ * The headroom_boost is an in-out parameter used to maintain headroom boost
+ * state between calls to this function.
+ *
+ * Returns the number of bytes actually written (which may be smaller than
+ * the delta by which the device hand has changed due to alignment).
  */
 static uint64_t
-l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
+    boolean_t *headroom_boost)
 {
        arc_buf_hdr_t *ab, *ab_prev, *head;
-       l2arc_buf_hdr_t *hdrl2;
        list_t *list;
-       uint64_t passed_sz, write_sz, buf_sz, headroom;
+       uint64_t write_asize, write_psize, write_sz, headroom,
+           buf_compress_minsz;
        void *buf_data;
-       kmutex_t *hash_lock, *list_lock = NULL;
-       boolean_t have_lock, full;
+       kmutex_t *list_lock = NULL;
+       boolean_t full;
        l2arc_write_callback_t *cb;
        zio_t *pio, *wzio;
        uint64_t guid = spa_load_guid(spa);
        int try;
+       const boolean_t do_headroom_boost = *headroom_boost;
 
        ASSERT(dev->l2ad_vdev != NULL);
 
+       /* Lower the flag now, we might want to raise it again later. */
+       *headroom_boost = B_FALSE;
+
        pio = NULL;
-       write_sz = 0;
+       write_sz = write_asize = write_psize = 0;
        full = B_FALSE;
        head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
        head->b_flags |= ARC_L2_WRITE_HEAD;
 
        /*
+        * We will want to try to compress buffers that are at least 2x the
+        * device sector size.
+        */
+       buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
+
+       /*
         * Copy buffers for L2ARC writing.
         */
        mutex_enter(&l2arc_buflist_mtx);
        for (try = 0; try <= 3; try++) {
+               uint64_t passed_sz = 0;
+
                list = l2arc_list_locked(try, &list_lock);
-               passed_sz = 0;
 
                /*
                 * L2ARC fast warmup.
@@ -4613,21 +4695,27 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
                 * Until the ARC is warm and starts to evict, read from the
                 * head of the ARC lists rather than the tail.
                 */
-               headroom = target_sz * l2arc_headroom;
                if (arc_warm == B_FALSE)
                        ab = list_head(list);
                else
                        ab = list_tail(list);
 
+               headroom = target_sz * l2arc_headroom;
+               if (do_headroom_boost)
+                       headroom = (headroom * l2arc_headroom_boost) / 100;
+
                for (; ab; ab = ab_prev) {
+                       l2arc_buf_hdr_t *l2hdr;
+                       kmutex_t *hash_lock;
+                       uint64_t buf_sz;
+
                        if (arc_warm == B_FALSE)
                                ab_prev = list_next(list, ab);
                        else
                                ab_prev = list_prev(list, ab);
 
                        hash_lock = HDR_LOCK(ab);
-                       have_lock = MUTEX_HELD(hash_lock);
-                       if (!have_lock && !mutex_tryenter(hash_lock)) {
+                       if (!mutex_tryenter(hash_lock)) {
                                /*
                                 * Skip this buffer rather than waiting.
                                 */
@@ -4673,16 +4761,29 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
                        /*
                         * Create and add a new L2ARC header.
                         */
-                       hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
-                                           KM_PUSHPAGE);
-                       hdrl2->b_dev = dev;
-                       hdrl2->b_daddr = dev->l2ad_hand;
+                       l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
+                           KM_PUSHPAGE);
+                       l2hdr->b_dev = dev;
+                       arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS);
 
                        ab->b_flags |= ARC_L2_WRITING;
-                       ab->b_l2hdr = hdrl2;
-                       list_insert_head(dev->l2ad_buflist, ab);
-                       buf_data = ab->b_buf->b_data;
+
+                       /*
+                        * Temporarily stash the data buffer in b_tmp_cdata.
+                        * The subsequent write step will pick it up from
+                        * there. This is because we can't access ab->b_buf
+                        * without holding the hash_lock, which we in turn
+                        * can't access without holding the ARC list locks
+                        * (which we want to avoid during compression/writing).
+                        */
+                       l2hdr->b_compress = ZIO_COMPRESS_OFF;
+                       l2hdr->b_asize = ab->b_size;
+                       l2hdr->b_tmp_cdata = ab->b_buf->b_data;
+
                        buf_sz = ab->b_size;
+                       ab->b_l2hdr = l2hdr;
+
+                       list_insert_head(dev->l2ad_buflist, ab);
 
                        /*
                         * Compute and store the buffer cksum before
@@ -4693,6 +4794,64 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 
                        mutex_exit(hash_lock);
 
+                       write_sz += buf_sz;
+               }
+
+               mutex_exit(list_lock);
+
+               if (full == B_TRUE)
+                       break;
+       }
+
+       /* No buffers selected for writing? */
+       if (pio == NULL) {
+               ASSERT0(write_sz);
+               mutex_exit(&l2arc_buflist_mtx);
+               kmem_cache_free(hdr_cache, head);
+               return (0);
+       }
+
+       /*
+        * Now start writing the buffers. We're starting at the write head
+        * and work backwards, retracing the course of the buffer selector
+        * loop above.
+        */
+       for (ab = list_prev(dev->l2ad_buflist, head); ab;
+           ab = list_prev(dev->l2ad_buflist, ab)) {
+               l2arc_buf_hdr_t *l2hdr;
+               uint64_t buf_sz;
+
+               /*
+                * We shouldn't need to lock the buffer here, since we flagged
+                * it as ARC_L2_WRITING in the previous step, but we must take
+                * care to only access its L2 cache parameters. In particular,
+                * ab->b_buf may be invalid by now due to ARC eviction.
+                */
+               l2hdr = ab->b_l2hdr;
+               l2hdr->b_daddr = dev->l2ad_hand;
+
+               if (!l2arc_nocompress && (ab->b_flags & ARC_L2COMPRESS) &&
+                   l2hdr->b_asize >= buf_compress_minsz) {
+                       if (l2arc_compress_buf(l2hdr)) {
+                               /*
+                                * If compression succeeded, enable headroom
+                                * boost on the next scan cycle.
+                                */
+                               *headroom_boost = B_TRUE;
+                       }
+               }
+
+               /*
+                * Pick up the buffer data we had previously stashed away
+                * (and now potentially also compressed).
+                */
+               buf_data = l2hdr->b_tmp_cdata;
+               buf_sz = l2hdr->b_asize;
+
+               /* Compression may have squashed the buffer to zero length. */
+               if (buf_sz != 0) {
+                       uint64_t buf_p_sz;
+
                        wzio = zio_write_phys(pio, dev->l2ad_vdev,
                            dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
                            NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
@@ -4702,33 +4861,24 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
                            zio_t *, wzio);
                        (void) zio_nowait(wzio);
 
+                       write_asize += buf_sz;
                        /*
                         * Keep the clock hand suitably device-aligned.
                         */
-                       buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
-
-                       write_sz += buf_sz;
-                       dev->l2ad_hand += buf_sz;
+                       buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+                       write_psize += buf_p_sz;
+                       dev->l2ad_hand += buf_p_sz;
                }
-
-               mutex_exit(list_lock);
-
-               if (full == B_TRUE)
-                       break;
        }
-       mutex_exit(&l2arc_buflist_mtx);
 
-       if (pio == NULL) {
-               ASSERT3U(write_sz, ==, 0);
-               kmem_cache_free(hdr_cache, head);
-               return (0);
-       }
+       mutex_exit(&l2arc_buflist_mtx);
 
-       ASSERT3U(write_sz, <=, target_sz);
+       ASSERT3U(write_asize, <=, target_sz);
        ARCSTAT_BUMP(arcstat_l2_writes_sent);
-       ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
+       ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
        ARCSTAT_INCR(arcstat_l2_size, write_sz);
-       vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
+       ARCSTAT_INCR(arcstat_l2_asize, write_asize);
+       vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
 
        /*
         * Bump device hand to the device start if it is approaching the end.
@@ -4746,7 +4896,153 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
        (void) zio_wait(pio);
        dev->l2ad_writing = B_FALSE;
 
-       return (write_sz);
+       return (write_asize);
+}
+
+/*
+ * Compresses an L2ARC buffer.
+ * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
+ * size in l2hdr->b_asize. This routine tries to compress the data and,
+ * depending on the compression result, there are three possible outcomes:
+ * *) The buffer was incompressible. The original l2hdr contents were left
+ *    untouched and are ready for writing to an L2 device.
+ * *) The buffer was all-zeros, so there is no need to write it to an L2
+ *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
+ *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
+ * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
+ *    data buffer which holds the compressed data to be written, and b_asize
+ *    tells us how much data there is. b_compress is set to the appropriate
+ *    compression algorithm. Once writing is done, invoke
+ *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
+ *
+ * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
+ * buffer was incompressible).
+ */
+static boolean_t
+l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
+{
+       void *cdata;
+       size_t csize, len;
+
+       ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
+       ASSERT(l2hdr->b_tmp_cdata != NULL);
+
+       len = l2hdr->b_asize;
+       cdata = zio_data_buf_alloc(len);
+       csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
+           cdata, l2hdr->b_asize);
+
+       if (csize == 0) {
+               /* zero block, indicate that there's nothing to write */
+               zio_data_buf_free(cdata, len);
+               l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
+               l2hdr->b_asize = 0;
+               l2hdr->b_tmp_cdata = NULL;
+               ARCSTAT_BUMP(arcstat_l2_compress_zeros);
+               return (B_TRUE);
+       } else if (csize > 0 && csize < len) {
+               /*
+                * Compression succeeded, we'll keep the cdata around for
+                * writing and release it afterwards.
+                */
+               l2hdr->b_compress = ZIO_COMPRESS_LZ4;
+               l2hdr->b_asize = csize;
+               l2hdr->b_tmp_cdata = cdata;
+               ARCSTAT_BUMP(arcstat_l2_compress_successes);
+               return (B_TRUE);
+       } else {
+               /*
+                * Compression failed, release the compressed buffer.
+                * l2hdr will be left unmodified.
+                */
+               zio_data_buf_free(cdata, len);
+               ARCSTAT_BUMP(arcstat_l2_compress_failures);
+               return (B_FALSE);
+       }
+}
+
+/*
+ * Decompresses a zio read back from an l2arc device. On success, the
+ * underlying zio's io_data buffer is overwritten by the uncompressed
+ * version. On decompression error (corrupt compressed stream), the
+ * zio->io_error value is set to signal an I/O error.
+ *
+ * Please note that the compressed data stream is not checksummed, so
+ * if the underlying device is experiencing data corruption, we may feed
+ * corrupt data to the decompressor; the decompressor must therefore be
+ * able to handle this situation (LZ4 does).
+ */
+static void
+l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
+{
+       uint64_t csize;
+       void *cdata;
+
+       ASSERT(L2ARC_IS_VALID_COMPRESS(c));
+
+       if (zio->io_error != 0) {
+               /*
+                * An io error has occurred; just restore the original io
+                * size in preparation for a main pool read.
+                */
+               zio->io_orig_size = zio->io_size = hdr->b_size;
+               return;
+       }
+
+       if (c == ZIO_COMPRESS_EMPTY) {
+               /*
+                * An empty buffer results in a null zio, which means we
+                * need to fill its io_data after we're done restoring the
+                * buffer's contents.
+                */
+               ASSERT(hdr->b_buf != NULL);
+               bzero(hdr->b_buf->b_data, hdr->b_size);
+               zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
+       } else {
+               ASSERT(zio->io_data != NULL);
+               /*
+                * We copy the compressed data from the start of the arc buffer
+                * (the zio_read will have pulled in only what we need, the
+                * rest is garbage which we will overwrite at decompression)
+                * and then decompress back to the ARC data buffer. This way we
+                * can minimize copying by simply decompressing back over the
+                * original compressed data (rather than decompressing to an
+                * aux buffer and then copying back the uncompressed buffer,
+                * which is likely to be much larger).
+                */
+               csize = zio->io_size;
+               cdata = zio_data_buf_alloc(csize);
+               bcopy(zio->io_data, cdata, csize);
+               if (zio_decompress_data(c, cdata, zio->io_data, csize,
+                   hdr->b_size) != 0)
+                       zio->io_error = EIO;
+               zio_data_buf_free(cdata, csize);
+       }
+
+       /* Restore the expected uncompressed IO size. */
+       zio->io_orig_size = zio->io_size = hdr->b_size;
+}
+
+/*
+ * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
+ * This buffer serves as a temporary holder of compressed data while
+ * the buffer entry is being written to an l2arc device. Once that is
+ * done, we can dispose of it.
+ */
+static void
+l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
+{
+       l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
+
+       if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
+               /*
+                * If the data was compressed, then we've allocated a
+                * temporary buffer for it, so now we need to release it.
+                */
+               ASSERT(l2hdr->b_tmp_cdata != NULL);
+               zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
+       }
+       l2hdr->b_tmp_cdata = NULL;
 }
 
 /*
@@ -4761,6 +5057,7 @@ l2arc_feed_thread(void)
        spa_t *spa;
        uint64_t size, wrote;
        clock_t begin, next = ddi_get_lbolt();
+       boolean_t headroom_boost = B_FALSE;
 
        CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
@@ -4821,7 +5118,7 @@ l2arc_feed_thread(void)
 
                ARCSTAT_BUMP(arcstat_l2_feeds);
 
-               size = l2arc_write_size(dev);
+               size = l2arc_write_size();
 
                /*
                 * Evict L2ARC buffers that will be overwritten.
@@ -4831,7 +5128,7 @@ l2arc_feed_thread(void)
                /*
                 * Write ARC buffers.
                 */
-               wrote = l2arc_write_buffers(spa, dev, size);
+               wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
 
                /*
                 * Calculate interval between writes.
@@ -4879,8 +5176,6 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
        adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
        adddev->l2ad_spa = spa;
        adddev->l2ad_vdev = vd;
-       adddev->l2ad_write = l2arc_write_max;
-       adddev->l2ad_boost = l2arc_write_boost;
        adddev->l2ad_start = VDEV_LABEL_START_SIZE;
        adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
        adddev->l2ad_hand = adddev->l2ad_start;
@@ -4888,7 +5183,6 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
        adddev->l2ad_first = B_TRUE;
        adddev->l2ad_writing = B_FALSE;
        list_link_init(&adddev->l2ad_node);
-       ASSERT3U(adddev->l2ad_write, >, 0);
 
        /*
         * This is a list of all ARC buffers that are still valid on the
@@ -5021,52 +5315,64 @@ EXPORT_SYMBOL(arc_getbuf_func);
 EXPORT_SYMBOL(arc_add_prune_callback);
 EXPORT_SYMBOL(arc_remove_prune_callback);
 
-module_param(zfs_arc_min, ulong, 0444);
+module_param(zfs_arc_min, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
 
-module_param(zfs_arc_max, ulong, 0444);
+module_param(zfs_arc_max, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
 
-module_param(zfs_arc_meta_limit, ulong, 0444);
+module_param(zfs_arc_meta_limit, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
 
-module_param(zfs_arc_meta_prune, int, 0444);
+module_param(zfs_arc_meta_prune, int, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
 
-module_param(zfs_arc_grow_retry, int, 0444);
+module_param(zfs_arc_grow_retry, int, 0644);
 MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
 
-module_param(zfs_arc_shrink_shift, int, 0444);
+module_param(zfs_arc_shrink_shift, int, 0644);
 MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
 
-module_param(zfs_arc_p_min_shift, int, 0444);
+module_param(zfs_arc_p_min_shift, int, 0644);
 MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
 
 module_param(zfs_disable_dup_eviction, int, 0644);
 MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
 
-module_param(l2arc_write_max, ulong, 0444);
+module_param(zfs_arc_memory_throttle_disable, int, 0644);
+MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");
+
+module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
+
+module_param(l2arc_write_max, ulong, 0644);
 MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
 
-module_param(l2arc_write_boost, ulong, 0444);
+module_param(l2arc_write_boost, ulong, 0644);
 MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
 
-module_param(l2arc_headroom, ulong, 0444);
+module_param(l2arc_headroom, ulong, 0644);
 MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
 
-module_param(l2arc_feed_secs, ulong, 0444);
+module_param(l2arc_headroom_boost, ulong, 0644);
+MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");
+
+module_param(l2arc_feed_secs, ulong, 0644);
 MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
 
-module_param(l2arc_feed_min_ms, ulong, 0444);
+module_param(l2arc_feed_min_ms, ulong, 0644);
 MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
 
-module_param(l2arc_noprefetch, int, 0444);
+module_param(l2arc_noprefetch, int, 0644);
 MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
 
-module_param(l2arc_feed_again, int, 0444);
+module_param(l2arc_nocompress, int, 0644);
+MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");
+
+module_param(l2arc_feed_again, int, 0644);
 MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
 
-module_param(l2arc_norw, int, 0444);
+module_param(l2arc_norw, int, 0644);
 MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
 
 #endif