X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fzil.c;h=db3822f5a8873dec5978950ff20d50f04dc01209;hb=058ac9ba7811baea78a47ba1ead8acd7512684b6;hp=95101882ba1b6440193c754ddbb4bfec9cebb048;hpb=172bb4bd5e4afef721dd4d2972d8680d983f144b;p=zfs.git diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 9510188..db3822f 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -19,12 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include +#include #include #include #include @@ -351,14 +352,20 @@ zil_create(zilog_t *zilog) blk = zh->zh_log; /* - * If we don't already have an initial log block, allocate one now. + * If we don't already have an initial log block or we have one + * but it's the wrong endianness then allocate one. */ - if (BP_IS_HOLE(&blk)) { + if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); (void) dmu_tx_assign(tx, TXG_WAIT); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); + if (!BP_IS_HOLE(&blk)) { + zio_free_blk(zilog->zl_spa, &blk, txg); + BP_ZERO(&blk); + } + error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk, NULL, txg); @@ -465,34 +472,22 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) } /* - * zil_rollback_destroy() is only called by the rollback code. - * We already have a syncing tx. Rollback has exclusive access to the - * dataset, so we don't have to worry about concurrent zil access. - * The actual freeing of any log blocks occurs in zil_sync() later in - * this txg syncing phase. + * return true if the initial log block is not valid */ -void -zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx) +static boolean_t +zil_empty(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; - uint64_t txg; + arc_buf_t *abuf = NULL; if (BP_IS_HOLE(&zh->zh_log)) - return; + return (B_TRUE); - txg = dmu_tx_get_txg(tx); - ASSERT3U(zilog->zl_destroy_txg, <, txg); - zilog->zl_destroy_txg = txg; - zilog->zl_keep_first = B_FALSE; + if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) + return (B_TRUE); - /* - * Ensure there's no outstanding ZIL IO. No lwbs or just the - * unused one that allocated in advance is ok. - */ - ASSERT(zilog->zl_lwb_list.list_head.list_next == - zilog->zl_lwb_list.list_head.list_prev); - (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, - tx, zh->zh_claim_txg); + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + return (B_FALSE); } int @@ -514,6 +509,30 @@ zil_claim(char *osname, void *txarg) zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); + if (zilog->zl_spa->spa_log_state == SPA_LOG_CLEAR) { + if (!BP_IS_HOLE(&zh->zh_log)) + zio_free_blk(zilog->zl_spa, &zh->zh_log, first_txg); + BP_ZERO(&zh->zh_log); + dsl_dataset_dirty(dmu_objset_ds(os), tx); + } + + /* + * Record here whether the zil has any records to replay. + * If the header block pointer is null or the block points + * to the stubby then we know there are no valid log records. + * We use the header to store this state as the the zilog gets + * freed later in dmu_objset_close(). + * The flags (and the rest of the header fields) are cleared in + * zil_sync() as a result of a zil_destroy(), after replaying the log. + * + * Note, the intent log can be empty but still need the + * stubby to be claimed. + */ + if (!zil_empty(zilog)) { + zh->zh_flags |= ZIL_REPLAY_NEEDED; + dsl_dataset_dirty(dmu_objset_ds(os), tx); + } + /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can @@ -581,36 +600,6 @@ zil_check_log_chain(char *osname, void *txarg) return (error); } -/* - * Clear a log chain - */ -/* ARGSUSED */ -int -zil_clear_log_chain(char *osname, void *txarg) -{ - zilog_t *zilog; - zil_header_t *zh; - objset_t *os; - dmu_tx_t *tx; - int error; - - error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); - if (error) { - cmn_err(CE_WARN, "can't open objset for %s", osname); - return (0); - } - - zilog = dmu_objset_zil(os); - tx = dmu_tx_create(zilog->zl_os); - (void) dmu_tx_assign(tx, TXG_WAIT); - zh = zil_header_in_syncing_context(zilog); - BP_ZERO(&zh->zh_log); - dsl_dataset_dirty(dmu_objset_ds(os), tx); - dmu_tx_commit(tx); - dmu_objset_close(os); - return (0); -} - static int zil_vdev_compare(const void *x1, const void *x2) { @@ -713,17 +702,26 @@ zil_lwb_write_done(zio_t *zio) ASSERT(zio->io_bp->blk_fill == 0); /* - * Now that we've written this log block, we have a stable pointer - * to the next block in the chain, so it's OK to let the txg in - * which we allocated the next block sync. + * Ensure the lwb buffer pointer is cleared before releasing + * the txg. If we have had an allocation failure and + * the txg is waiting to sync then we want want zil_sync() + * to remove the lwb so that it's not picked up as the next new + * one in zil_commit_writer(). zil_sync() will only remove + * the lwb if lwb_buf is null. */ - txg_rele_to_sync(&lwb->lwb_txgh); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_buf = NULL; if (zio->io_error) zilog->zl_log_error = B_TRUE; + + /* + * Now that we've written this log block, we have a stable pointer + * to the next block in the chain, so it's OK to let the txg in + * which we allocated the next block sync. We still have the + * zl_lock to ensure zil_sync doesn't kmem free the lwb. + */ + txg_rele_to_sync(&lwb->lwb_txgh); mutex_exit(&zilog->zl_lock); } @@ -746,9 +744,9 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) } if (lwb->lwb_zio == NULL) { lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - 0, &lwb->lwb_blk, lwb->lwb_buf, - lwb->lwb_sz, zil_lwb_write_done, lwb, - ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL, &zb); + 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, + zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb); } } @@ -928,6 +926,10 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) } error = zilog->zl_get_data( itx->itx_private, lr, dbuf, lwb->lwb_zio); + if (error == EIO) { + txg_wait_synced(zilog->zl_dmu_pool, txg); + return (lwb); + } if (error) { ASSERT(error == ENOENT || error == EEXIST || error == EALREADY); @@ -1034,7 +1036,7 @@ zil_clean(zilog_t *zilog) if ((itx != NULL) && (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) { (void) taskq_dispatch(zilog->zl_clean_taskq, - (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP); + (task_func_t *)zil_itx_clean, zilog, TQ_SLEEP); } mutex_exit(&zilog->zl_lock); } @@ -1210,20 +1212,26 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) spa_t *spa = zilog->zl_spa; lwb_t *lwb; + /* + * We don't zero out zl_destroy_txg, so make sure we don't try + * to destroy it twice. + */ + if (spa_sync_pass(spa) != 1) + return; + mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); - zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK]; + zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK]; if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; ASSERT(list_head(&zilog->zl_lwb_list) == NULL); - ASSERT(spa_sync_pass(spa) == 1); bzero(zh, sizeof (zil_header_t)); - bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq)); + bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* @@ -1239,12 +1247,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) } } - for (;;) { - lwb = list_head(&zilog->zl_lwb_list); - if (lwb == NULL) { - mutex_exit(&zilog->zl_lock); - return; - } + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) break; @@ -1338,25 +1341,6 @@ zil_free(zilog_t *zilog) } /* - * return true if the initial log block is not valid - */ -static boolean_t -zil_empty(zilog_t *zilog) -{ - const zil_header_t *zh = zilog->zl_header; - arc_buf_t *abuf = NULL; - - if (BP_IS_HOLE(&zh->zh_log)) - return (B_TRUE); - - if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) - return (B_TRUE); - - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - return (B_FALSE); -} - -/* * Open an intent log. */ zilog_t * @@ -1411,7 +1395,7 @@ zil_suspend(zilog_t *zilog) const zil_header_t *zh = zilog->zl_header; mutex_enter(&zilog->zl_lock); - if (zh->zh_claim_txg != 0) { /* unplayed log */ + if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); return (EBUSY); } @@ -1460,9 +1444,7 @@ zil_resume(zilog_t *zilog) typedef struct zil_replay_arg { objset_t *zr_os; zil_replay_func_t **zr_replay; - zil_replay_cleaner_t *zr_replay_cleaner; void *zr_arg; - uint64_t *zr_txgp; boolean_t zr_byteswap; char *zr_lrbuf; } zil_replay_arg_t; @@ -1475,9 +1457,9 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; char *name; - int pass, error, sunk; + int pass, error; - if (zilog->zl_stop_replay) + if (!zilog->zl_replay) /* giving up */ return; if (lr->lrc_txg < claim_txg) /* already committed */ @@ -1489,6 +1471,11 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; + if (txtype == 0 || txtype >= TX_MAX_TYPE) { + error = EINVAL; + goto bad; + } + /* * Make a copy of the data so we can revise and extend it. */ @@ -1539,69 +1526,16 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) } /* - * Replay of large truncates can end up needing additional txs - * and a different txg. If they are nested within the replay tx - * as below then a hang is possible. So we do the truncate here - * and redo the truncate later (a no-op) and update the sequence - * number whilst in the replay tx. Fortunately, it's safe to repeat - * a truncate if we crash and the truncate commits. A create over - * an existing file will also come in as a TX_TRUNCATE record. - * - * Note, remove of large files and renames over large files is - * handled by putting the deleted object on a stable list - * and if necessary force deleting the object outside of the replay - * transaction using the zr_replay_cleaner. - */ - if (txtype == TX_TRUNCATE) { - *zr->zr_txgp = TXG_NOWAIT; - error = zr->zr_replay[TX_TRUNCATE](zr->zr_arg, zr->zr_lrbuf, - zr->zr_byteswap); - if (error) - goto bad; - zr->zr_byteswap = 0; /* only byteswap once */ - } - - /* * We must now do two things atomically: replay this log record, - * and update the log header to reflect the fact that we did so. - * We use the DMU's ability to assign into a specific txg to do this. + * and update the log header sequence number to reflect the fact that + * we did so. At the end of each replay function the sequence number + * is updated if we are in replay mode. */ - for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) { - uint64_t replay_txg; - dmu_tx_t *replay_tx; - - replay_tx = dmu_tx_create(zr->zr_os); - error = dmu_tx_assign(replay_tx, TXG_WAIT); - if (error) { - dmu_tx_abort(replay_tx); - break; - } - - replay_txg = dmu_tx_get_txg(replay_tx); - - if (txtype == 0 || txtype >= TX_MAX_TYPE) { - error = EINVAL; - } else { - /* - * On the first pass, arrange for the replay vector - * to fail its dmu_tx_assign(). That's the only way - * to ensure that those code paths remain well tested. - * - * Only byteswap (if needed) on the 1st pass. - */ - *zr->zr_txgp = replay_txg - (pass == 1); - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, - zr->zr_byteswap && pass == 1); - *zr->zr_txgp = TXG_NOWAIT; - } - - if (error == 0) { - dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx); - zilog->zl_replay_seq[replay_txg & TXG_MASK] = - lr->lrc_seq; - } - - dmu_tx_commit(replay_tx); + for (pass = 1; pass <= 2; pass++) { + zilog->zl_replaying_seq = lr->lrc_seq; + /* Only byteswap (if needed) on the 1st pass. */ + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, + zr->zr_byteswap && pass == 1); if (!error) return; @@ -1609,37 +1543,22 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with - * EEXIST. So if we receive any error other than ERESTART - * we try syncing out any removes then retrying the - * transaction. + * EEXIST. So if we receive any error we try syncing out + * any removes then retry the transaction. */ - if (error != ERESTART && !sunk) { - if (zr->zr_replay_cleaner) - zr->zr_replay_cleaner(zr->zr_arg); + if (pass == 1) txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); - sunk = B_TRUE; - continue; /* retry */ - } - - if (error != ERESTART) - break; - - if (pass != 1) - txg_wait_open(spa_get_dsl(zilog->zl_spa), - replay_txg + 1); - - dprintf("pass %d, retrying\n", pass); } bad: - ASSERT(error && error != ERESTART); + ASSERT(error); name = kmem_alloc(MAXNAMELEN, KM_SLEEP); dmu_objset_name(zr->zr_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype, (lr->lrc_txtype & TX_CI) ? "CI" : ""); - zilog->zl_stop_replay = 1; + zilog->zl_replay = B_FALSE; kmem_free(name, MAXNAMELEN); } @@ -1654,24 +1573,20 @@ zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) * If this dataset has a non-empty intent log, replay it and destroy it. */ void -zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE], - zil_replay_cleaner_t *replay_cleaner) +zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; - if (zil_empty(zilog)) { + if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { zil_destroy(zilog, B_TRUE); return; } zr.zr_os = os; zr.zr_replay = replay_func; - zr.zr_replay_cleaner = replay_cleaner; zr.zr_arg = arg; - zr.zr_txgp = txgp; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); @@ -1680,7 +1595,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp, */ txg_wait_synced(zilog->zl_dmu_pool, 0); - zilog->zl_stop_replay = 0; + zilog->zl_replay = B_TRUE; zilog->zl_replay_time = lbolt; ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, @@ -1689,6 +1604,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp, zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + zilog->zl_replay = B_FALSE; } /* @@ -1733,3 +1649,24 @@ out: mutex_exit(&zilog->zl_lock); return (ret); } + +/* ARGSUSED */ +int +zil_vdev_offline(char *osname, void *arg) +{ + objset_t *os; + zilog_t *zilog; + int error; + + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) + return (error); + + zilog = dmu_objset_zil(os); + if (zil_suspend(zilog) != 0) + error = EEXIST; + else + zil_resume(zilog); + dmu_objset_close(os); + return (error); +}