Illumos #3329, #3330, #3331, #3335
[zfs.git] / module / zfs / space_map.c
index 0a1fd59..d99c7c0 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/zio.h>
 #include <sys/space_map.h>
 
+static kmem_cache_t *space_seg_cache;
+
+void
+space_map_init(void) /* create the file-wide space_seg_cache */
+{
+       ASSERT(space_seg_cache == NULL); /* cache must not exist yet */
+       space_seg_cache = kmem_cache_create("space_seg_cache", /* objects are sizeof (space_seg_t) */
+           sizeof (space_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+space_map_fini(void) /* destroy space_seg_cache and forget the pointer */
+{
+       kmem_cache_destroy(space_seg_cache);
+       space_seg_cache = NULL; /* matches the NULL check in space_map_init() */
+}
+
 /*
  * Space map routines.
  * NOTE: caller is responsible for all locking.
@@ -60,6 +75,8 @@ space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
 {
        bzero(sm, sizeof (*sm));
 
+       cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);
+
        avl_create(&sm->sm_root, space_map_seg_compare,
            sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
 
@@ -75,6 +92,7 @@ space_map_destroy(space_map_t *sm)
        ASSERT(!sm->sm_loaded && !sm->sm_loading);
        VERIFY3U(sm->sm_space, ==, 0);
        avl_destroy(&sm->sm_root);
+       cv_destroy(&sm->sm_load_cv);
 }
 
 void
@@ -115,19 +133,33 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
 
        if (merge_before && merge_after) {
                avl_remove(&sm->sm_root, ss_before);
+               if (sm->sm_pp_root) {
+                       avl_remove(sm->sm_pp_root, ss_before);
+                       avl_remove(sm->sm_pp_root, ss_after);
+               }
                ss_after->ss_start = ss_before->ss_start;
-               kmem_free(ss_before, sizeof (*ss_before));
+               kmem_cache_free(space_seg_cache, ss_before);
+               ss = ss_after;
        } else if (merge_before) {
                ss_before->ss_end = end;
+               if (sm->sm_pp_root)
+                       avl_remove(sm->sm_pp_root, ss_before);
+               ss = ss_before;
        } else if (merge_after) {
                ss_after->ss_start = start;
+               if (sm->sm_pp_root)
+                       avl_remove(sm->sm_pp_root, ss_after);
+               ss = ss_after;
        } else {
-               ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
+               ss = kmem_cache_alloc(space_seg_cache, KM_PUSHPAGE);
                ss->ss_start = start;
                ss->ss_end = end;
                avl_insert(&sm->sm_root, ss, where);
        }
 
+       if (sm->sm_pp_root)
+               avl_add(sm->sm_pp_root, ss);
+
        sm->sm_space += size;
 }
 
@@ -162,25 +194,34 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
        left_over = (ss->ss_start != start);
        right_over = (ss->ss_end != end);
 
+       if (sm->sm_pp_root)
+               avl_remove(sm->sm_pp_root, ss);
+
        if (left_over && right_over) {
-               newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
+               newseg = kmem_cache_alloc(space_seg_cache, KM_PUSHPAGE);
                newseg->ss_start = end;
                newseg->ss_end = ss->ss_end;
                ss->ss_end = start;
                avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
+               if (sm->sm_pp_root)
+                       avl_add(sm->sm_pp_root, newseg);
        } else if (left_over) {
                ss->ss_end = start;
        } else if (right_over) {
                ss->ss_start = end;
        } else {
                avl_remove(&sm->sm_root, ss);
-               kmem_free(ss, sizeof (*ss));
+               kmem_cache_free(space_seg_cache, ss);
+               ss = NULL;
        }
 
+       if (sm->sm_pp_root && ss != NULL)
+               avl_add(sm->sm_pp_root, ss);
+
        sm->sm_space -= size;
 }
 
-int
+boolean_t
 space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
 {
        avl_index_t where;
@@ -210,7 +251,7 @@ space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
        while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
                if (func != NULL)
                        func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
-               kmem_free(ss, sizeof (*ss));
+               kmem_cache_free(space_seg_cache, ss);
        }
        sm->sm_space = 0;
 }
@@ -220,59 +261,10 @@ space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
 {
        space_seg_t *ss;
 
-       for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-               func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
-}
-
-void
-space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
-{
-       avl_tree_t *t = &sm->sm_root;
-       avl_index_t where;
-       space_seg_t *ss, search;
-       uint64_t end = start + size;
-       uint64_t rm_start, rm_end;
-
        ASSERT(MUTEX_HELD(sm->sm_lock));
 
-       search.ss_start = start;
-       search.ss_end = start;
-
-       for (;;) {
-               ss = avl_find(t, &search, &where);
-
-               if (ss == NULL)
-                       ss = avl_nearest(t, where, AVL_AFTER);
-
-               if (ss == NULL || ss->ss_start >= end)
-                       break;
-
-               rm_start = MAX(ss->ss_start, start);
-               rm_end = MIN(ss->ss_end, end);
-
-               space_map_remove(sm, rm_start, rm_end - rm_start);
-       }
-}
-
-/*
- * Replace smd with the union of smd and sms.
- */
-void
-space_map_union(space_map_t *smd, space_map_t *sms)
-{
-       avl_tree_t *t = &sms->sm_root;
-       space_seg_t *ss;
-
-       ASSERT(MUTEX_HELD(smd->sm_lock));
-
-       /*
-        * For each source segment, remove any intersections with the
-        * destination, then add the source segment to the destination.
-        */
-       for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
-               space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
-               space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
-       }
+       for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+               func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
 }
 
 /*
@@ -283,8 +275,10 @@ space_map_load_wait(space_map_t *sm)
 {
        ASSERT(MUTEX_HELD(sm->sm_lock));
 
-       while (sm->sm_loading)
+       while (sm->sm_loading) {
+               ASSERT(!sm->sm_loaded);
                cv_wait(&sm->sm_load_cv, sm->sm_lock);
+       }
 }
 
 /*
@@ -301,11 +295,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
        int error = 0;
 
        ASSERT(MUTEX_HELD(sm->sm_lock));
-
-       space_map_load_wait(sm);
-
-       if (sm->sm_loaded)
-               return (0);
+       ASSERT(!sm->sm_loaded);
+       ASSERT(!sm->sm_loading);
 
        sm->sm_loading = B_TRUE;
        end = smo->smo_objsize;
@@ -336,7 +327,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
                    smo->smo_object, offset, size);
 
                mutex_exit(sm->sm_lock);
-               error = dmu_read(os, smo->smo_object, offset, size, entry_map);
+               error = dmu_read(os, smo->smo_object, offset, size, entry_map,
+                   DMU_READ_PREFETCH);
                mutex_enter(sm->sm_lock);
                if (error != 0)
                        break;
@@ -390,6 +382,13 @@ space_map_unload(space_map_t *sm)
 }
 
 uint64_t
+space_map_maxsize(space_map_t *sm) /* returns whatever sm's smop_max op returns */
+{
+       ASSERT(sm->sm_ops != NULL); /* sm_ops is dereferenced on the next line */
+       return (sm->sm_ops->smop_max(sm));
+}
+
+uint64_t
 space_map_alloc(space_map_t *sm, uint64_t size)
 {
        uint64_t start;
@@ -424,7 +423,7 @@ space_map_sync(space_map_t *sm, uint8_t maptype,
        spa_t *spa = dmu_objset_spa(os);
        void *cookie = NULL;
        space_seg_t *ss;
-       uint64_t bufsize, start, size, run_len;
+       uint64_t bufsize, start, size, run_len, delta, sm_space;
        uint64_t *entry, *entry_map, *entry_map_end;
 
        ASSERT(MUTEX_HELD(sm->sm_lock));
@@ -453,11 +452,13 @@ space_map_sync(space_map_t *sm, uint8_t maptype,
            SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
            SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
 
+       delta = 0;
+       sm_space = sm->sm_space;
        while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
                size = ss->ss_end - ss->ss_start;
                start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
 
-               sm->sm_space -= size;
+               delta += size;
                size >>= sm->sm_shift;
 
                while (size) {
@@ -479,7 +480,7 @@ space_map_sync(space_map_t *sm, uint8_t maptype,
                        start += run_len;
                        size -= run_len;
                }
-               kmem_free(ss, sizeof (*ss));
+               kmem_cache_free(space_seg_cache, ss);
        }
 
        if (entry != entry_map) {
@@ -491,8 +492,15 @@ space_map_sync(space_map_t *sm, uint8_t maptype,
                smo->smo_objsize += size;
        }
 
+       /*
+        * Ensure that the space_map's accounting wasn't changed
+        * while we were in the middle of writing it out.
+        */
+       VERIFY3U(sm->sm_space, ==, sm_space);
+
        zio_buf_free(entry_map, bufsize);
 
+       sm->sm_space -= delta;
        VERIFY3U(sm->sm_space, ==, 0);
 }
 
@@ -504,3 +512,131 @@ space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
        smo->smo_objsize = 0;
        smo->smo_alloc = 0;
 }
+
+/*
+ * Space map reference trees.
+ *
+ * A space map is a collection of integers.  Every integer is either
+ * in the map, or it's not.  A space map reference tree generalizes
+ * the idea: it allows its members to have arbitrary reference counts,
+ * as opposed to the implicit reference count of 0 or 1 in a space map.
+ * This representation comes in handy when computing the union or
+ * intersection of multiple space maps.  For example, the union of
+ * N space maps is the subset of the reference tree with refcnt >= 1.
+ * The intersection of N space maps is the subset with refcnt >= N.
+ *
+ * [It's very much like a Fourier transform.  Unions and intersections
+ * are hard to perform in the 'space map domain', so we convert the maps
+ * into the 'reference count domain', where it's trivial, then invert.]
+ *
+ * vdev_dtl_reassess() uses computations of this form to determine
+ * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
+ * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
+ * has an outage wherever refcnt >= vdev_children.
+ *
+ * The comparator below orders nodes by sr_offset and breaks ties by
+ * node address, so it returns 0 only when both arguments are the same
+ * node -- presumably so that distinct nodes sharing an offset can
+ * coexist in the tree.
+ */
+static int
+space_map_ref_compare(const void *x1, const void *x2)
+{
+       const space_ref_t *sr1 = x1;
+       const space_ref_t *sr2 = x2;
+
+       if (sr1->sr_offset < sr2->sr_offset) /* primary key: sr_offset */
+               return (-1);
+       if (sr1->sr_offset > sr2->sr_offset)
+               return (1);
+
+       if (sr1 < sr2) /* same offset: fall back to node address */
+               return (-1);
+       if (sr1 > sr2)
+               return (1);
+
+       return (0); /* only reached when x1 and x2 are the same node */
+}
+
+void
+space_map_ref_create(avl_tree_t *t) /* set up t as an AVL tree of space_ref_t nodes */
+{
+       avl_create(t, space_map_ref_compare, /* ordered by space_map_ref_compare() */
+           sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
+}
+
+void
+space_map_ref_destroy(avl_tree_t *t) /* free every node in t, then destroy t */
+{
+       space_ref_t *sr;
+       void *cookie = NULL; /* iteration state for avl_destroy_nodes() */
+
+       while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
+               kmem_free(sr, sizeof (*sr)); /* pairs with kmem_alloc() in space_map_ref_add_node() */
+
+       avl_destroy(t);
+}
+
+static void
+space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) /* insert one (offset, refcnt) node into t */
+{
+       space_ref_t *sr;
+
+       sr = kmem_alloc(sizeof (*sr), KM_PUSHPAGE); /* freed in space_map_ref_destroy() */
+       sr->sr_offset = offset;
+       sr->sr_refcnt = refcnt;
+
+       avl_add(t, sr);
+}
+
+void
+space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
+       int64_t refcnt) /* record one segment as a pair of nodes in t */
+{
+       space_map_ref_add_node(t, start, refcnt); /* +refcnt recorded at start */
+       space_map_ref_add_node(t, end, -refcnt); /* -refcnt recorded at end */
+}
+
+/*
+ * Convert (or add) a space map into a reference tree.
+ *
+ * Walks sm->sm_root from avl_first() through AVL_NEXT() and passes each
+ * segment's ss_start and ss_end to space_map_ref_add_seg() together with
+ * the given refcnt.  The walk only reads sm.  The caller must hold
+ * sm->sm_lock.
+ */
+void
+space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt)
+{
+       space_seg_t *ss;
+
+       ASSERT(MUTEX_HELD(sm->sm_lock));
+
+       for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+               space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt); /* two nodes per segment */
+}
+
+/*
+ * Convert a reference tree into a space map.  The space map will contain
+ * all members of the reference tree for which refcnt >= minref.
+ *
+ * sm is first emptied with space_map_vacate().  The tree is then walked
+ * from avl_first() through AVL_NEXT() while keeping a running sum of
+ * sr_refcnt.  A range is opened at the first node where the sum is
+ * >= minref and closed at the next node where the sum is < minref; the
+ * closed range is passed to space_map_add() unless it is empty.
+ * start == -1ULL is the sentinel for "no range currently open".
+ * The caller must hold sm->sm_lock.
+ */
+void
+space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref)
+{
+       uint64_t start = -1ULL; /* -1ULL: no range open */
+       int64_t refcnt = 0; /* running sum of sr_refcnt */
+       space_ref_t *sr;
+
+       ASSERT(MUTEX_HELD(sm->sm_lock));
+
+       space_map_vacate(sm, NULL, NULL); /* drop existing segments, no callback */
+
+       for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
+               refcnt += sr->sr_refcnt;
+               if (refcnt >= minref) {
+                       if (start == -1ULL) { /* open a range only if none is open */
+                               start = sr->sr_offset;
+                       }
+               } else {
+                       if (start != -1ULL) { /* sum fell below minref: close the range */
+                               uint64_t end = sr->sr_offset;
+                               ASSERT(start <= end);
+                               if (end > start) /* skip zero-length ranges */
+                                       space_map_add(sm, start, end - start);
+                               start = -1ULL;
+                       }
+               }
+       }
+       ASSERT(refcnt == 0); /* the sr_refcnt values must sum to zero */
+       ASSERT(start == -1ULL); /* no range may be left open */
+}