*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
+#include <sys/arc.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
+struct dbuf_hold_impl_data { /* one __dbuf_hold_impl() call frame: its arguments plus its locals */
+ /* Function arguments */ /* filled in by __dbuf_hold_impl_init() */
+ dnode_t *dh_dn; /* dnode the block belongs to */
+ uint8_t dh_level; /* level of the block being held */
+ uint64_t dh_blkid; /* block id at dh_level */
+ int dh_fail_sparse; /* non-zero: return ENOENT when the block pointer is a hole */
+ void *dh_tag; /* tag passed to refcount_add() for the new hold */
+ dmu_buf_impl_t **dh_dbp; /* out: set to NULL on entry, to the held dbuf on success */
+ /* Local variables */ /* scratch state used only inside __dbuf_hold_impl() */
+ dmu_buf_impl_t *dh_db; /* dbuf returned by dbuf_find() or dbuf_create() */
+ dmu_buf_impl_t *dh_parent; /* parent dbuf from dbuf_findbp(); released after db_mtx is dropped */
+ blkptr_t *dh_bp; /* block pointer from dbuf_findbp() */
+ int dh_err; /* result of dbuf_findbp() */
+ dbuf_dirty_record_t *dh_dr; /* db_data_pending of dh_db, when a private copy may be needed */
+ arc_buf_contents_t dh_type; /* DBUF_GET_BUFC_TYPE(dh_db), passed to arc_buf_alloc() */
+ int dh_depth; /* recursion depth of this frame; set by __dbuf_hold_impl_init(), asserted < DBUF_HOLD_IMPL_MAX_DEPTH */
+};
+
+static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
+ dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+ void *tag, dmu_buf_impl_t **dbp, int depth);
+static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
+
static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
refcount_create(&db->db_holds);
+ list_link_init(&db->db_link);
return (0);
}
retry:
h->hash_table_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+ /* Large allocations which do not require contiguous pages
+ * should be using vmem_alloc() in the linux kernel */
+ h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+#else
h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+#endif
if (h->hash_table == NULL) {
/* XXX - we should really return an error instead of assert */
ASSERT(hsize > (1ULL << 10));
for (i = 0; i < DBUF_MUTEXES; i++)
mutex_destroy(&h->hash_mutexes[i]);
+#if defined(_KERNEL) && defined(HAVE_SPL)
+ /* Large allocations which do not require contiguous pages
+ * should be using vmem_free() in the linux kernel */
+ vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+#else
kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+#endif
kmem_cache_destroy(dbuf_cache);
}
&dn->dn_phys->dn_blkptr[db->db_blkid]);
} else {
/* db is pointed to by an indirect block */
- int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
+ ASSERTV(int epb = db->db_parent->db.db_size >>
+ SPA_BLKPTRSHIFT);
ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
ASSERT3U(db->db_parent->db.db_object, ==,
db->db.db_object);
* data when we evict this buffer.
*/
if (db->db_dirtycnt == 0) {
- uint64_t *buf = db->db.db_data;
+ ASSERTV(uint64_t *buf = db->db.db_data);
int i;
for (i = 0; i < db->db.db_size >> 3; i++) {
dn->dn_dirtyctx =
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
ASSERT(dn->dn_dirtyctx_firstset == NULL);
- dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+ dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_PUSHPAGE);
}
mutex_exit(&dn->dn_mtx);
* to make a copy of it so that the changes we make in this
* transaction group won't leak out when we sync the older txg.
*/
- dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
+ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_PUSHPAGE);
+ list_link_init(&dr->dr_dirty_node);
if (db->db_level == 0) {
void *data_old = db->db_buf;
* it, since one of the current holders may be in the
* middle of an update. Note that users of dbuf_undirty()
* should not place a hold on the dbuf before the call.
+ * Also note: we can get here with a spill block, so
+ * test for that similar to how dbuf_dirty does.
*/
if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
mutex_exit(&db->db_mtx);
/* Make sure we don't toss this buffer at sync phase */
- mutex_enter(&dn->dn_mtx);
- dnode_clear_range(dn, db->db_blkid, 1, tx);
- mutex_exit(&dn->dn_mtx);
+ if (db->db_blkid != DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ dnode_clear_range(dn, db->db_blkid, 1, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
DB_DNODE_EXIT(db);
return (0);
}
*drp = dr->dr_next;
+ /*
+ * Note that there are three places in dbuf_dirty()
+ * where this dirty record may be put on a list.
+ * Make sure to do a list_remove corresponding to
+ * every one of those list_insert calls.
+ */
if (dr->dr_parent) {
mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
list_remove(&dr->dr_parent->dt.di.dr_children, dr);
mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
- } else if (db->db_level+1 == dn->dn_nlevels) {
+ } else if (db->db_blkid == DMU_SPILL_BLKID ||
+ db->db_level+1 == dn->dn_nlevels) {
ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
dbuf_rele(parent, db);
}
-static int
+__attribute__((always_inline))
+static inline int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
- dmu_buf_impl_t **parentp, blkptr_t **bpp)
+ dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh)
{
int nlevels, epbs;
return (ENOENT);
} else if (level < nlevels-1) {
/* this block is referenced from an indirect block */
- int err = dbuf_hold_impl(dn, level+1,
- blkid >> epbs, fail_sparse, NULL, parentp);
+ int err;
+ if (dh == NULL) {
+ err = dbuf_hold_impl(dn, level+1, blkid >> epbs,
+ fail_sparse, NULL, parentp);
+ }
+ else {
+ __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1,
+ blkid >> epbs, fail_sparse, NULL,
+ parentp, dh->dh_depth + 1);
+ err = __dbuf_hold_impl(dh + 1);
+ }
if (err)
return (err);
err = dbuf_read(*parentp, NULL,
return;
/* dbuf_find() returns with db_mtx held */
- if (db = dbuf_find(dn, 0, blkid)) {
+ if ((db = dbuf_find(dn, 0, blkid))) {
/*
* This dbuf is already in the cache. We assume that
* it is already CACHED, or else about to be either
return;
}
- if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
+ if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
if (bp && !BP_IS_HOLE(bp)) {
int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
}
}
+#define DBUF_HOLD_IMPL_MAX_DEPTH 20
+
/*
* Returns with db_holds incremented, and db_mtx not held.
* Note: dn_struct_rwlock must be held.
*/
-int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
- void *tag, dmu_buf_impl_t **dbp)
+static int
+__dbuf_hold_impl(struct dbuf_hold_impl_data *dh) /* worker for dbuf_hold_impl(); arguments and locals live in *dh */
{
- dmu_buf_impl_t *db, *parent = NULL;
+ ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH); /* dbuf_findbp() recurses through dh + 1, so depth must stay inside the frame array */
+ dh->dh_parent = NULL;
- ASSERT(blkid != DMU_BONUS_BLKID);
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- ASSERT3U(dn->dn_nlevels, >, level);
+ ASSERT(dh->dh_blkid != DMU_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock));
+ ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level);
- *dbp = NULL;
+ *(dh->dh_dbp) = NULL; /* caller sees NULL unless the hold succeeds */
top:
/* dbuf_find() returns with db_mtx held */
- db = dbuf_find(dn, level, blkid);
-
- if (db == NULL) {
- blkptr_t *bp = NULL;
- int err;
-
- ASSERT3P(parent, ==, NULL);
- err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
- if (fail_sparse) {
- if (err == 0 && bp && BP_IS_HOLE(bp))
- err = ENOENT;
- if (err) {
- if (parent)
- dbuf_rele(parent, NULL);
- return (err);
+ dh->dh_db = dbuf_find(dh->dh_dn, dh->dh_level, dh->dh_blkid);
+
+ if (dh->dh_db == NULL) {
+ dh->dh_bp = NULL;
+
+ ASSERT3P(dh->dh_parent, ==, NULL);
+ dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid,
+ dh->dh_fail_sparse, &dh->dh_parent,
+ &dh->dh_bp, dh); /* passing dh lets dbuf_findbp() recurse using the next frame (dh + 1) */
+ if (dh->dh_fail_sparse) {
+ if (dh->dh_err == 0 && dh->dh_bp && BP_IS_HOLE(dh->dh_bp))
+ dh->dh_err = ENOENT;
+ if (dh->dh_err) {
+ if (dh->dh_parent)
+ dbuf_rele(dh->dh_parent, NULL);
+ return (dh->dh_err);
}
}
- if (err && err != ENOENT)
- return (err);
- db = dbuf_create(dn, level, blkid, parent, bp);
- }
-
- if (db->db_buf && refcount_is_zero(&db->db_holds)) {
- arc_buf_add_ref(db->db_buf, db);
- if (db->db_buf->b_data == NULL) {
- dbuf_clear(db);
- if (parent) {
- dbuf_rele(parent, NULL);
- parent = NULL;
+ if (dh->dh_err && dh->dh_err != ENOENT)
+ return (dh->dh_err);
+ dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid,
+ dh->dh_parent, dh->dh_bp);
+ }
+
+ if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) {
+ arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db);
+ if (dh->dh_db->db_buf->b_data == NULL) {
+ dbuf_clear(dh->dh_db);
+ if (dh->dh_parent) {
+ dbuf_rele(dh->dh_parent, NULL);
+ dh->dh_parent = NULL;
}
goto top;
}
- ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+ ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data);
}
- ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
+ ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf));
/*
* If this buffer is currently syncing out, and we are are
* still referencing it from db_data, we need to make a copy
* of it in case we decide we want to dirty it again in this txg.
*/
- if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
- dn->dn_object != DMU_META_DNODE_OBJECT &&
- db->db_state == DB_CACHED && db->db_data_pending) {
- dbuf_dirty_record_t *dr = db->db_data_pending;
-
- if (dr->dt.dl.dr_data == db->db_buf) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-
- dbuf_set_data(db,
- arc_buf_alloc(dn->dn_objset->os_spa,
- db->db.db_size, db, type));
- bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
- db->db.db_size);
+ if (dh->dh_db->db_level == 0 &&
+ dh->dh_db->db_blkid != DMU_BONUS_BLKID &&
+ dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT &&
+ dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) {
+ dh->dh_dr = dh->dh_db->db_data_pending;
+
+ if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) {
+ dh->dh_type = DBUF_GET_BUFC_TYPE(dh->dh_db);
+
+ dbuf_set_data(dh->dh_db,
+ arc_buf_alloc(dh->dh_dn->dn_objset->os_spa,
+ dh->dh_db->db.db_size, dh->dh_db, dh->dh_type));
+ bcopy(dh->dh_dr->dt.dl.dr_data->b_data,
+ dh->dh_db->db.db_data, dh->dh_db->db.db_size);
}
}
- (void) refcount_add(&db->db_holds, tag);
- dbuf_update_data(db);
- DBUF_VERIFY(db);
- mutex_exit(&db->db_mtx);
+ (void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag); /* take the hold under dh_tag */
+ dbuf_update_data(dh->dh_db);
+ DBUF_VERIFY(dh->dh_db);
+ mutex_exit(&dh->dh_db->db_mtx);
/* NOTE: we can't rele the parent until after we drop the db_mtx */
- if (parent)
- dbuf_rele(parent, NULL);
+ if (dh->dh_parent)
+ dbuf_rele(dh->dh_parent, NULL);
- ASSERT3P(DB_DNODE(db), ==, dn);
- ASSERT3U(db->db_blkid, ==, blkid);
- ASSERT3U(db->db_level, ==, level);
- *dbp = db;
+ ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn);
+ ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid);
+ ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level);
+ *(dh->dh_dbp) = dh->dh_db; /* hand the held dbuf back through the out parameter */
return (0);
}
+/*
+ * dbuf_hold_impl() is the public entry point for the recursive hold
+ * logic in __dbuf_hold_impl().  To keep each stack frame small, the
+ * function arguments AND local variables of every recursion level are
+ * kept in a struct dbuf_hold_impl_data.  An array holding
+ * DBUF_HOLD_IMPL_MAX_DEPTH such frames is allocated from the heap here
+ * and freed again before returning.
+ *
+ * The array is allocated with KM_PUSHPAGE, matching the other dbuf
+ * allocations in this file.
+ * NOTE(review): presumably this is needed because holds can be taken
+ * while dirty data is being written out -- confirm against the SPL
+ * kmem flag documentation.
+ *
+ * Returns 0 with *dbp set to the held dbuf, or the error reported by
+ * __dbuf_hold_impl().
+ */
+int
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+    void *tag, dmu_buf_impl_t **dbp)
+{
+	struct dbuf_hold_impl_data *dh;
+	size_t dh_size = sizeof (*dh) * DBUF_HOLD_IMPL_MAX_DEPTH;
+	int error;
+
+	/* Zeroed so that every frame's local state starts out cleared. */
+	dh = kmem_zalloc(dh_size, KM_PUSHPAGE);
+	__dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0);
+
+	error = __dbuf_hold_impl(dh);
+
+	kmem_free(dh, dh_size);
+
+	return (error);
+}
+
+/*
+ * Load one recursion frame with the arguments of a (nested) hold request.
+ * Only the argument fields and the depth are written; the scratch fields
+ * of the frame are left untouched.
+ */
+static void
+__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
+    dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+    void *tag, dmu_buf_impl_t **dbp, int depth)
+{
+	/* Position of this frame in the recursion. */
+	dh->dh_depth = depth;
+
+	/* Which block is being held. */
+	dh->dh_dn = dn;
+	dh->dh_blkid = blkid;
+	dh->dh_level = level;
+
+	/* How the hold is taken and where the result goes. */
+	dh->dh_fail_sparse = fail_sparse;
+	dh->dh_dbp = dbp;
+	dh->dh_tag = tag;
+}
+
dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
- int64_t holds = refcount_add(&db->db_holds, tag);
- ASSERT(holds > 1);
+ VERIFY(refcount_add(&db->db_holds, tag) > 1); /* value returned by refcount_add() must exceed 1 -- presumably db was already held by the caller; confirm */
}
/*
}
}
-static void
+/* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
+ * is critical that we not allow the compiler to inline this function into
+ * dbuf_sync_list(), thereby drastically bloating the stack usage.
+ */
+noinline static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
zio_nowait(zio);
}
-static void
+/* dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
+ * critical that we not allow the compiler to inline this function into
+ * dbuf_sync_list(), thereby drastically bloating the stack usage.
+ */
+noinline static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
arc_buf_t **datap = &dr->dt.dl.dr_data;
ASSERT(dr->dr_next == NULL);
ASSERT(dr->dr_dbuf == db);
*drp = dr->dr_next;
+ if (dr->dr_dbuf->db_level != 0) {
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
kmem_free(dr, sizeof (dbuf_dirty_record_t));
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
{
dbuf_dirty_record_t *dr;
- while (dr = list_head(list)) {
+ while ((dr = list_head(list))) {
if (dr->dr_zio != NULL) {
/*
* If we find an already initialized zio then we
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
if (!BP_IS_HOLE(db->db_blkptr)) {
- int epbs =
- dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERTV(int epbs = dn->dn_phys->dn_indblkshift -
+ SPA_BLKPTRSHIFT);
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
db->db.db_size);
ASSERT3U(dn->dn_phys->dn_maxblkid
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(dmu_buf_rele);
+EXPORT_SYMBOL(dmu_buf_will_dirty);
+#endif