Illumos #3137 L2ARC compression
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index add2bc3..faa6cc3 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
+#include <sys/arc.h>
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dbuf.h>
@@ -258,7 +262,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
                boolean_t is_metadata;
 
                DB_DNODE_ENTER(db);
-               is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata;
+               is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
                DB_DNODE_EXIT(db);
 
                return (is_metadata);
@@ -293,7 +297,13 @@ dbuf_init(void)
 
 retry:
        h->hash_table_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+       /* Large allocations which do not require contiguous pages
+        * should use vmem_alloc() in the Linux kernel */
+       h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_PUSHPAGE);
+#else
        h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+#endif
        if (h->hash_table == NULL) {
                /* XXX - we should really return an error instead of assert */
                ASSERT(hsize > (1ULL << 10));
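 
In dbuf_init(), the failure path below this assertion halves hsize and jumps back to the retry: label above, so the hash table degrades to smaller power-of-two sizes rather than failing outright. A minimal userspace sketch of that fallback pattern, using calloc() in place of vmem_zalloc()/kmem_zalloc() (all names and sizes here are illustrative, not the kernel code):

    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Sketch of dbuf_init()'s halving-retry allocation: try the desired
     * power-of-two table size and fall back to smaller sizes until the
     * allocation succeeds or the asserted floor is reached.
     */
    static void **
    alloc_hash_table(unsigned long long *hsize)
    {
    retry:
        {
            void **table = calloc(*hsize, sizeof (void *));
            if (table != NULL)
                return (table);
        }
        if (*hsize <= (1ULL << 10))     /* mirrors the ASSERT floor */
            return (NULL);
        *hsize >>= 1;
        goto retry;
    }

    int
    main(void)
    {
        unsigned long long hsize = 1ULL << 20;
        void **table = alloc_hash_table(&hsize);

        if (table != NULL) {
            printf("allocated %llu buckets\n", hsize);
            free(table);
        }
        return (0);
    }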
@@ -317,7 +327,13 @@ dbuf_fini(void)
 
        for (i = 0; i < DBUF_MUTEXES; i++)
                mutex_destroy(&h->hash_mutexes[i]);
+#if defined(_KERNEL) && defined(HAVE_SPL)
+       /* Large allocations which do not require contiguous pages
+        * should use vmem_free() in the Linux kernel */
+       vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+#else
        kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+#endif
        kmem_cache_destroy(dbuf_cache);
 }
 
@@ -358,7 +374,7 @@ dbuf_verify(dmu_buf_impl_t *db)
        } else if (db->db_blkid == DMU_SPILL_BLKID) {
                ASSERT(dn != NULL);
                ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
-               ASSERT3U(db->db.db_offset, ==, 0);
+               ASSERT0(db->db.db_offset);
        } else {
                ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
        }
@@ -544,7 +560,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
        spa_t *spa;
        zbookmark_t zb;
        uint32_t aflags = ARC_NOWAIT;
-       arc_buf_t *pbuf;
 
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
@@ -600,20 +615,16 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 
        if (DBUF_IS_L2CACHEABLE(db))
                aflags |= ARC_L2CACHE;
+       if (DBUF_IS_L2COMPRESSIBLE(db))
+               aflags |= ARC_L2COMPRESS;
 
        SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
            db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
            db->db.db_object, db->db_level, db->db_blkid);
 
        dbuf_add_ref(db, NULL);
-       /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
-
-       if (db->db_parent)
-               pbuf = db->db_parent->db_buf;
-       else
-               pbuf = db->db_objset->os_phys_buf;
 
-       (void) dsl_read(zio, spa, db->db_blkptr, pbuf,
+       (void) arc_read(zio, spa, db->db_blkptr,
            dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
            (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
            &aflags, &zb);
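 
With the Illumos #3137 changes, callers invoke arc_read() directly instead of going through dsl_read(), and the read path no longer needs a parent ARC buffer, which is why the pbuf plumbing above is deleted. Reconstructed from this call site, the new signature is roughly as follows (a sketch; parameter names are approximate, and the authoritative prototype lives in sys/arc.h):

    int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
        arc_done_func_t *done, void *private, int priority, int zio_flags,
        uint32_t *arc_flags, const zbookmark_t *zb);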
@@ -1011,7 +1022,6 @@ void
 dbuf_release_bp(dmu_buf_impl_t *db)
 {
        objset_t *os;
-       zbookmark_t zb;
 
        DB_GET_OBJSET(&os, db);
        ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
@@ -1019,13 +1029,7 @@ dbuf_release_bp(dmu_buf_impl_t *db)
            list_link_active(&os->os_dsl_dataset->ds_synced_link));
        ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
 
-       zb.zb_objset = os->os_dsl_dataset ?
-           os->os_dsl_dataset->ds_object : 0;
-       zb.zb_object = db->db.db_object;
-       zb.zb_level = db->db_level;
-       zb.zb_blkid = db->db_blkid;
-       (void) arc_release_bp(db->db_buf, db,
-           db->db_blkptr, os->os_spa, &zb);
+       (void) arc_release(db->db_buf, db);
 }
 
 dbuf_dirty_record_t *
@@ -1082,7 +1086,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                dn->dn_dirtyctx =
                    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
                ASSERT(dn->dn_dirtyctx_firstset == NULL);
-               dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+               dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_PUSHPAGE);
        }
        mutex_exit(&dn->dn_mtx);
 
@@ -1159,7 +1163,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
         * to make a copy of it so that the changes we make in this
         * transaction group won't leak out when we sync the older txg.
         */
-       dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
+       dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_PUSHPAGE);
        list_link_init(&dr->dr_dirty_node);
        if (db->db_level == 0) {
                void *data_old = db->db_buf;
@@ -1334,13 +1338,17 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
         * it, since one of the current holders may be in the
         * middle of an update.  Note that users of dbuf_undirty()
         * should not place a hold on the dbuf before the call.
+        * Also note: we can get here with a spill block, so
+        * test for that similarly to how dbuf_dirty() does.
         */
        if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
                mutex_exit(&db->db_mtx);
                /* Make sure we don't toss this buffer at sync phase */
-               mutex_enter(&dn->dn_mtx);
-               dnode_clear_range(dn, db->db_blkid, 1, tx);
-               mutex_exit(&dn->dn_mtx);
+               if (db->db_blkid != DMU_SPILL_BLKID) {
+                       mutex_enter(&dn->dn_mtx);
+                       dnode_clear_range(dn, db->db_blkid, 1, tx);
+                       mutex_exit(&dn->dn_mtx);
+               }
                DB_DNODE_EXIT(db);
                return (0);
        }
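 
The new spill-block guard matters because a spill block does not occupy an ordinary block-id range, so handing DMU_SPILL_BLKID to dnode_clear_range() would be wrong. A standalone sketch of the guard (DMU_SPILL_BLKID is the (-2ULL) sentinel as defined in sys/dmu.h; the helper below is an illustrative stand-in, not ZFS code):

    #include <stdio.h>

    #define DMU_SPILL_BLKID (-2ULL)    /* sentinel block id, per sys/dmu.h */

    /* Illustrative stand-in for the guarded dnode_clear_range() call. */
    static void
    clear_range_if_ordinary(unsigned long long blkid)
    {
        if (blkid != DMU_SPILL_BLKID)
            printf("dnode_clear_range(blkid=%llu)\n", blkid);
        else
            printf("spill block: skip range clear\n");
    }

    int
    main(void)
    {
        clear_range_if_ordinary(42);
        clear_range_if_ordinary(DMU_SPILL_BLKID);
        return (0);
    }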
@@ -1353,11 +1361,18 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
        *drp = dr->dr_next;
 
+       /*
+        * Note that there are three places in dbuf_dirty()
+        * where this dirty record may be put on a list.
+        * Make sure to do a list_remove() corresponding to
+        * every one of those list_insert() calls.
+        */
        if (dr->dr_parent) {
                mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
                list_remove(&dr->dr_parent->dt.di.dr_children, dr);
                mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
-       } else if (db->db_level+1 == dn->dn_nlevels) {
+       } else if (db->db_blkid == DMU_SPILL_BLKID ||
+           db->db_level+1 == dn->dn_nlevels) {
                ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
                mutex_enter(&dn->dn_mtx);
                list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
@@ -1694,7 +1709,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
        ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
        ASSERT(dn->dn_type != DMU_OT_NONE);
 
-       db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+       db = kmem_cache_alloc(dbuf_cache, KM_PUSHPAGE);
 
        db->db_objset = os;
        db->db.db_object = dn->dn_object;
@@ -1860,7 +1875,6 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
                if (bp && !BP_IS_HOLE(bp)) {
                        int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
                            ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
-                       arc_buf_t *pbuf;
                        dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
                        uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
                        zbookmark_t zb;
@@ -1868,13 +1882,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
                        SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
                            dn->dn_object, 0, blkid);
 
-                       if (db)
-                               pbuf = db->db_buf;
-                       else
-                               pbuf = dn->dn_objset->os_phys_buf;
-
-                       (void) dsl_read(NULL, dn->dn_objset->os_spa,
-                           bp, pbuf, NULL, NULL, priority,
+                       (void) arc_read(NULL, dn->dn_objset->os_spa,
+                           bp, NULL, NULL, priority,
                            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
                            &aflags, &zb);
                }
@@ -1994,7 +2003,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
        int error;
 
        dh = kmem_zalloc(sizeof(struct dbuf_hold_impl_data) *
-           DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP);
+           DBUF_HOLD_IMPL_MAX_DEPTH, KM_PUSHPAGE);
        __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0);
 
        error = __dbuf_hold_impl(dh);
@@ -2163,7 +2172,24 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
                        dbuf_evict(db);
                } else {
                        VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
-                       if (!DBUF_IS_CACHEABLE(db))
+
+                       /*
+                        * A dbuf will be eligible for eviction if either the
+                        * 'primarycache' property is set or a duplicate
+                        * copy of this buffer is already cached in the arc.
+                        *
+                        * In the case of the 'primarycache' property, a
+                        * buffer is considered for eviction if it matches
+                        * the criteria set in the property.
+                        *
+                        * To decide if our buffer is considered a
+                        * duplicate, we must call into the arc to determine
+                        * if multiple buffers are referencing the same
+                        * block on-disk. If so, then we simply evict
+                        * ourselves.
+                        */
+                       if (!DBUF_IS_CACHEABLE(db) ||
+                           arc_buf_eviction_needed(db->db_buf))
                                dbuf_clear(db);
                        else
                                mutex_exit(&db->db_mtx);
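 
Condensed, the new test evicts when either clause fires. A standalone sketch of the decision, where is_cacheable and has_arc_duplicate are illustrative stand-ins for DBUF_IS_CACHEABLE() and arc_buf_eviction_needed():

    #include <stdio.h>

    typedef int boolean_t;

    /*
     * Evict when policy forbids caching this buffer, or when the ARC
     * already holds another buffer referencing the same on-disk block.
     */
    static boolean_t
    should_evict(boolean_t is_cacheable, boolean_t has_arc_duplicate)
    {
        return (!is_cacheable || has_arc_duplicate);
    }

    int
    main(void)
    {
        printf("policy forbids caching: %d\n", should_evict(0, 0));
        printf("duplicate in ARC:       %d\n", should_evict(1, 1));
        printf("cacheable and unique:   %d\n", should_evict(1, 0));
        return (0);
    }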
@@ -2291,7 +2317,11 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
        }
 }
 
-static void
+/* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
+ * is critical that we not allow the compiler to inline this function into
+ * dbuf_sync_list(), which would drastically bloat the stack usage.
+ */
+noinline static void
 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db = dr->dr_dbuf;
@@ -2334,7 +2364,11 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        zio_nowait(zio);
 }
 
-static void
+/* dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
+ * critical that we not allow the compiler to inline this function into
+ * dbuf_sync_list(), which would drastically bloat the stack usage.
+ */
+noinline static void
 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
        arc_buf_t **datap = &dr->dt.dl.dr_data;
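 
Under the SPL, as in the Linux kernel proper, noinline expands to GCC's noinline attribute. The hazard the two comments above describe can be shown standalone: if a helper that recurses through a dispatcher is inlined into that dispatcher, the helper's locals land in every frame on the recursion path. A minimal sketch (all names illustrative):

    #include <stdio.h>

    /* In kernel code this comes from compiler headers; defined here so
     * the sketch builds standalone. */
    #define noinline __attribute__((noinline))

    static void dispatch(int depth);   /* plays the role of dbuf_sync_list() */

    /*
     * Plays the role of dbuf_sync_indirect()/dbuf_sync_leaf(): it
     * recurses back through the dispatcher. If the compiler inlined it
     * into dispatch(), this scratch buffer would be added to every
     * dispatch() frame on the recursion path.
     */
    noinline static void
    sync_child(int depth)
    {
        char scratch[256];

        snprintf(scratch, sizeof (scratch), "syncing at depth %d", depth);
        puts(scratch);
        dispatch(depth - 1);
    }

    static void
    dispatch(int depth)
    {
        if (depth > 0)
            sync_child(depth);
    }

    int
    main(void)
    {
        dispatch(3);
        return (0);
    }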
@@ -2382,7 +2416,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
                dbuf_dirty_record_t **drp;
 
                ASSERT(*datap != NULL);
-               ASSERT3U(db->db_level, ==, 0);
+               ASSERT0(db->db_level);
                ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
                bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
                DB_DNODE_EXIT(db);
@@ -2585,7 +2619,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
        uint64_t txg = zio->io_txg;
        dbuf_dirty_record_t **drp, *dr;
 
-       ASSERT3U(zio->io_error, ==, 0);
+       ASSERT0(zio->io_error);
        ASSERT(db->db_blkptr == bp);
 
        if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
@@ -2791,8 +2825,47 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
        } else {
                ASSERT(arc_released(data));
                dr->dr_zio = arc_write(zio, os->os_spa, txg,
-                   db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
-                   dbuf_write_ready, dbuf_write_done, db,
-                   ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
-       }
-}
+                   db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
+                   DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
+                   dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
+                   ZIO_FLAG_MUSTSUCCEED, &zb);
+       }
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(dbuf_find);
+EXPORT_SYMBOL(dbuf_is_metadata);
+EXPORT_SYMBOL(dbuf_evict);
+EXPORT_SYMBOL(dbuf_loan_arcbuf);
+EXPORT_SYMBOL(dbuf_whichblock);
+EXPORT_SYMBOL(dbuf_read);
+EXPORT_SYMBOL(dbuf_unoverride);
+EXPORT_SYMBOL(dbuf_free_range);
+EXPORT_SYMBOL(dbuf_new_size);
+EXPORT_SYMBOL(dbuf_release_bp);
+EXPORT_SYMBOL(dbuf_dirty);
+EXPORT_SYMBOL(dmu_buf_will_dirty);
+EXPORT_SYMBOL(dmu_buf_will_not_fill);
+EXPORT_SYMBOL(dmu_buf_will_fill);
+EXPORT_SYMBOL(dmu_buf_fill_done);
+EXPORT_SYMBOL(dmu_buf_rele);
+EXPORT_SYMBOL(dbuf_assign_arcbuf);
+EXPORT_SYMBOL(dbuf_clear);
+EXPORT_SYMBOL(dbuf_prefetch);
+EXPORT_SYMBOL(dbuf_hold_impl);
+EXPORT_SYMBOL(dbuf_hold);
+EXPORT_SYMBOL(dbuf_hold_level);
+EXPORT_SYMBOL(dbuf_create_bonus);
+EXPORT_SYMBOL(dbuf_spill_set_blksz);
+EXPORT_SYMBOL(dbuf_rm_spill);
+EXPORT_SYMBOL(dbuf_add_ref);
+EXPORT_SYMBOL(dbuf_rele);
+EXPORT_SYMBOL(dbuf_rele_and_unlock);
+EXPORT_SYMBOL(dbuf_refcount);
+EXPORT_SYMBOL(dbuf_sync_list);
+EXPORT_SYMBOL(dmu_buf_set_user);
+EXPORT_SYMBOL(dmu_buf_set_user_ie);
+EXPORT_SYMBOL(dmu_buf_update_user);
+EXPORT_SYMBOL(dmu_buf_get_user);
+EXPORT_SYMBOL(dmu_buf_freeable);
+#endif
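 
Two notes on the tail of this change. First, the arc_write() call above implies the function gained a second boolean_t for L2ARC compression alongside the existing L2ARC-eligibility flag; reconstructed from that call site, the new signature is roughly as follows (a sketch; parameter names are approximate, and the authoritative prototype lives in sys/arc.h):

    zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
        blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc,
        boolean_t l2arc_compress, const zio_prop_t *zp,
        arc_done_func_t *ready, arc_done_func_t *done, void *private,
        int priority, int zio_flags, const zbookmark_t *zb);

Second, the EXPORT_SYMBOL() block makes these dbuf interfaces visible to other loadable kernel modules when built against the SPL. For readers unfamiliar with the macro, a minimal illustration for a hypothetical module (not part of this diff):

    #include <linux/module.h>

    /* Exported so that other loadable modules can link against it. */
    int
    example_add(int a, int b)
    {
        return (a + b);
    }
    EXPORT_SYMBOL(example_add);

    MODULE_LICENSE("GPL");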