Illumos #883: ZIL reuse during remount corruption
authorEric Schrock <Eric.Schrock@delphix.com>
Tue, 26 Jul 2011 19:41:53 +0000 (12:41 -0700)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Mon, 1 Aug 2011 19:09:11 +0000 (12:09 -0700)
Moving the zil_free() cleanup to zil_close() prevents this
problem from occurring in the first place.  There is a very
good description of the issue and fix in Illumus #883.

Reviewed by: Matt Ahrens <Matt.Ahrens@delphix.com>
Reviewed by: Adam Leventhal <Adam.Leventhal@delphix.com>
Reviewed by: Albert Lee <trisk@nexenta.com>
Reviewed by: Gordon Ross <gwr@nexenta.com>
Reviewed by: Garrett D'Amore <garrett@nexenta.com>
Reivewed by: Dan McDonald <danmcd@nexenta.com>
Approved by: Gordon Ross <gwr@nexenta.com>

References to Illumos issue and patch:
- https://www.illumos.org/issues/883
- https://github.com/illumos/illumos-gate/commit/c9ba2a43cb

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #340

cmd/ztest/ztest.c
module/zfs/zil.c

index 235bf56..7158158 100644 (file)
@@ -206,6 +206,7 @@ typedef struct ztest_od {
  */
 typedef struct ztest_ds {
        objset_t        *zd_os;
+       krwlock_t       zd_zilog_lock;
        zilog_t         *zd_zilog;
        uint64_t        zd_seq;
        ztest_od_t      *zd_od;         /* debugging aid */
@@ -239,6 +240,7 @@ ztest_func_t ztest_dmu_commit_callbacks;
 ztest_func_t ztest_zap;
 ztest_func_t ztest_zap_parallel;
 ztest_func_t ztest_zil_commit;
+ztest_func_t ztest_zil_remount;
 ztest_func_t ztest_dmu_read_write_zcopy;
 ztest_func_t ztest_dmu_objset_create_destroy;
 ztest_func_t ztest_dmu_prealloc;
@@ -274,6 +276,7 @@ ztest_info_t ztest_info[] = {
        { ztest_zap_parallel,                   100,    &zopt_always    },
        { ztest_split_pool,                     1,      &zopt_always    },
        { ztest_zil_commit,                     1,      &zopt_incessant },
+       { ztest_zil_remount,                    1,      &zopt_sometimes },
        { ztest_dmu_read_write_zcopy,           1,      &zopt_often     },
        { ztest_dmu_objset_create_destroy,      1,      &zopt_often     },
        { ztest_dsl_prop_get_set,               1,      &zopt_often     },
@@ -1007,6 +1010,7 @@ ztest_zd_init(ztest_ds_t *zd, objset_t *os)
        dmu_objset_name(os, zd->zd_name);
        int l;
 
+       rw_init(&zd->zd_zilog_lock, NULL, RW_DEFAULT, NULL);
        mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);
 
        for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
@@ -1022,6 +1026,7 @@ ztest_zd_fini(ztest_ds_t *zd)
        int l;
 
        mutex_destroy(&zd->zd_dirobj_lock);
+       rw_destroy(&zd->zd_zilog_lock);
 
        for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
                ztest_rll_destroy(&zd->zd_object_lock[l]);
@@ -1993,6 +1998,8 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
        if (ztest_random(2) == 0)
                io_type = ZTEST_IO_WRITE_TAG;
 
+       (void) rw_enter(&zd->zd_zilog_lock, RW_READER);
+
        switch (io_type) {
 
        case ZTEST_IO_WRITE_TAG:
@@ -2030,6 +2037,8 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
                break;
        }
 
+       (void) rw_exit(&zd->zd_zilog_lock);
+
        umem_free(data, blocksize);
 }
 
@@ -2084,6 +2093,8 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
 {
        zilog_t *zilog = zd->zd_zilog;
 
+       (void) rw_enter(&zd->zd_zilog_lock, RW_READER);
+
        zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
 
        /*
@@ -2095,6 +2106,31 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
        ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq);
        zd->zd_seq = zilog->zl_commit_lr_seq;
        mutex_exit(&zilog->zl_lock);
+
+       (void) rw_exit(&zd->zd_zilog_lock);
+}
+
+/*
+ * This function is designed to simulate the operations that occur during a
+ * mount/unmount operation.  We hold the dataset across these operations in an
+ * attempt to expose any implicit assumptions about ZIL management.
+ */
+/* ARGSUSED */
+void
+ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
+{
+       objset_t *os = zd->zd_os;
+
+       (void) rw_enter(&zd->zd_zilog_lock, RW_WRITER);
+
+       /* zfsvfs_teardown() */
+       zil_close(zd->zd_zilog);
+
+       /* zfsvfs_setup() */
+       VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
+       zil_replay(os, zd, ztest_replay_vector);
+
+       (void) rw_exit(&zd->zd_zilog_lock);
 }
 
 /*
index 8aa811d..5296b38 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -562,7 +563,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
 
        if (!list_is_empty(&zilog->zl_lwb_list)) {
                ASSERT(zh->zh_claim_txg == 0);
-               ASSERT(!keep_first);
+               VERIFY(!keep_first);
                while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
                        list_remove(&zilog->zl_lwb_list, lwb);
                        if (lwb->lwb_buf != NULL)
@@ -1665,21 +1666,11 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
 void
 zil_free(zilog_t *zilog)
 {
-       lwb_t *head_lwb;
        int i;
 
        zilog->zl_stop_sync = 1;
 
-       /*
-        * After zil_close() there should only be one lwb with a buffer.
-        */
-       head_lwb = list_head(&zilog->zl_lwb_list);
-       if (head_lwb) {
-               ASSERT(head_lwb == list_tail(&zilog->zl_lwb_list));
-               list_remove(&zilog->zl_lwb_list, head_lwb);
-               zio_buf_free(head_lwb->lwb_buf, head_lwb->lwb_sz);
-               kmem_cache_free(zil_lwb_cache, head_lwb);
-       }
+       ASSERT(list_is_empty(&zilog->zl_lwb_list));
        list_destroy(&zilog->zl_lwb_list);
 
        avl_destroy(&zilog->zl_vdev_tree);
@@ -1719,6 +1710,10 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
 {
        zilog_t *zilog = dmu_objset_zil(os);
 
+       ASSERT(zilog->zl_clean_taskq == NULL);
+       ASSERT(zilog->zl_get_data == NULL);
+       ASSERT(list_is_empty(&zilog->zl_lwb_list));
+
        zilog->zl_get_data = get_data;
        zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
            2, 2, TASKQ_PREPOPULATE);
@@ -1732,7 +1727,7 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
 void
 zil_close(zilog_t *zilog)
 {
-       lwb_t *tail_lwb;
+       lwb_t *lwb;
        uint64_t txg = 0;
 
        zil_commit(zilog, 0); /* commit all itx */
@@ -1744,9 +1739,9 @@ zil_close(zilog_t *zilog)
         * destroy the zl_clean_taskq.
         */
        mutex_enter(&zilog->zl_lock);
-       tail_lwb = list_tail(&zilog->zl_lwb_list);
-       if (tail_lwb != NULL)
-               txg = tail_lwb->lwb_max_txg;
+       lwb = list_tail(&zilog->zl_lwb_list);
+       if (lwb != NULL)
+               txg = lwb->lwb_max_txg;
        mutex_exit(&zilog->zl_lock);
        if (txg)
                txg_wait_synced(zilog->zl_dmu_pool, txg);
@@ -1754,6 +1749,19 @@ zil_close(zilog_t *zilog)
        taskq_destroy(zilog->zl_clean_taskq);
        zilog->zl_clean_taskq = NULL;
        zilog->zl_get_data = NULL;
+
+       /*
+        * We should have only one LWB left on the list; remove it now.
+        */
+       mutex_enter(&zilog->zl_lock);
+       lwb = list_head(&zilog->zl_lwb_list);
+       if (lwb != NULL) {
+               ASSERT(lwb == list_tail(&zilog->zl_lwb_list));
+               list_remove(&zilog->zl_lwb_list, lwb);
+               zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+               kmem_cache_free(zil_lwb_cache, lwb);
+       }
+       mutex_exit(&zilog->zl_lock);
 }
 
 /*