Illumos #3006
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 56c4610..cc51ea4 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 
-#define WITH_NDF_BLOCK_ALLOCATOR
+#define WITH_DF_BLOCK_ALLOCATOR
+
+/*
+ * Allow allocations to switch to gang blocks quickly. We do this to
+ * avoid having to load lots of space_maps in a given txg. There are,
+ * however, some cases where we want to avoid "fast" ganging and instead
+ * we want to do an exhaustive search of all metaslabs on this device.
+ * Currently we don't allow any gang, zil, or dump device related allocations
+ * to "fast" gang.
+ */
+#define        CAN_FASTGANG(flags) \
+       (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
+       METASLAB_GANG_AVOID)))
 
 uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;    /* force gang blocks */
 
 /*
+ * This value defines the number of allowed allocation failures per vdev.
+ * If a device reaches this threshold in a given txg then we consider skipping
+ * allocations on that device.
+ */
+int zfs_mg_alloc_failures;
+
+/*
  * Metaslab debugging: when set, keeps all space maps in core to verify frees.
  */
-static int metaslab_debug = 0;
+int metaslab_debug = 0;
 
 /*
  * Minimum size which forces the dynamic allocator to change
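[Editor's note: a stand-alone sketch of what the CAN_FASTGANG() test above accepts and rejects. The flag values are assumptions for illustration only; the snippet is not part of the patch.]

/* Hypothetical illustration of CAN_FASTGANG(); flag values assumed. */
#include <assert.h>

#define	METASLAB_GANG_HEADER	0x2
#define	METASLAB_GANG_CHILD	0x4
#define	METASLAB_GANG_AVOID	0x8

#define	CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))

int
main(void)
{
	assert(CAN_FASTGANG(0));			/* ordinary data write */
	assert(!CAN_FASTGANG(METASLAB_GANG_CHILD));	/* gang member */
	assert(!CAN_FASTGANG(METASLAB_GANG_AVOID));	/* e.g. zil/dump allocation */
	return (0);
}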
@@ -82,11 +102,12 @@ metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
 {
        metaslab_class_t *mc;
 
-       mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+       mc = kmem_zalloc(sizeof (metaslab_class_t), KM_PUSHPAGE);
 
        mc->mc_spa = spa;
        mc->mc_rotor = NULL;
        mc->mc_ops = ops;
+       mutex_init(&mc->mc_fastwrite_lock, NULL, MUTEX_DEFAULT, NULL);
 
        return (mc);
 }
@@ -100,6 +121,7 @@ metaslab_class_destroy(metaslab_class_t *mc)
        ASSERT(mc->mc_space == 0);
        ASSERT(mc->mc_dspace == 0);
 
+       mutex_destroy(&mc->mc_fastwrite_lock);
        kmem_free(mc, sizeof (metaslab_class_t));
 }
 
@@ -197,7 +219,7 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 {
        metaslab_group_t *mg;
 
-       mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
+       mg = kmem_zalloc(sizeof (metaslab_group_t), KM_PUSHPAGE);
        mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
        avl_create(&mg->mg_metaslab_tree, metaslab_compare,
            sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
@@ -402,9 +424,9 @@ metaslab_pp_load(space_map_t *sm)
        space_seg_t *ss;
 
        ASSERT(sm->sm_ppd == NULL);
-       sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+       sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_PUSHPAGE);
 
-       sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+       sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_PUSHPAGE);
        avl_create(sm->sm_pp_root, metaslab_segsize_compare,
            sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
 
@@ -705,7 +727,7 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
        vdev_t *vd = mg->mg_vd;
        metaslab_t *msp;
 
-       msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+       msp = kmem_zalloc(sizeof (metaslab_t), KM_PUSHPAGE);
        mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 
        msp->ms_smo_syncing = *smo;
@@ -770,7 +792,7 @@ metaslab_fini(metaslab_t *msp)
        for (t = 0; t < TXG_DEFER_SIZE; t++)
                space_map_destroy(&msp->ms_defermap[t]);
 
-       ASSERT3S(msp->ms_deferspace, ==, 0);
+       ASSERT0(msp->ms_deferspace);
 
        mutex_exit(&msp->ms_lock);
        mutex_destroy(&msp->ms_lock);
@@ -865,7 +887,7 @@ metaslab_prefetch(metaslab_group_t *mg)
 }
 
 static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 {
        metaslab_group_t *mg = msp->ms_group;
        space_map_t *sm = &msp->ms_map;
@@ -877,8 +899,9 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
        if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
                space_map_load_wait(sm);
                if (!sm->sm_loaded) {
-                       int error = space_map_load(sm, sm_ops, SM_FREE,
-                           &msp->ms_smo,
+                       space_map_obj_t *smo = &msp->ms_smo;
+
+                       int error = space_map_load(sm, sm_ops, SM_FREE, smo,
                            spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
                        if (error)  {
                                metaslab_group_sort(msp->ms_group, msp, 0);
@@ -899,13 +922,6 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
                        mutex_exit(&mg->mg_lock);
                }
 
-               /*
-                * If we were able to load the map then make sure
-                * that this map is still able to satisfy our request.
-                */
-               if (msp->ms_weight < size)
-                       return (ENOSPC);
-
                metaslab_group_sort(msp->ms_group, msp,
                    msp->ms_weight | activation_weight);
        }
@@ -1123,6 +1139,7 @@ void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
        vdev_t *vd = mg->mg_vd;
+       int64_t failures = mg->mg_alloc_failures;
        int m;
 
        /*
@@ -1140,6 +1157,8 @@ metaslab_sync_reassess(metaslab_group_t *mg)
                mutex_exit(&msp->ms_lock);
        }
 
+       atomic_add_64(&mg->mg_alloc_failures, -failures);
+
        /*
         * Prefetch the next potential metaslabs
         */
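[Editor's note: the counter reset above subtracts a previously sampled value instead of storing zero, so failures recorded by concurrently allocating threads are not lost. A minimal stand-alone sketch of that pattern with a plain counter; purely illustrative, not part of the patch.]

/* Why subtract a snapshot rather than store 0: increments that land
 * between the read and the reset survive the reset. */
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint64_t failures = 10;		/* accumulated before reassessment */
	uint64_t snapshot = failures;	/* int64_t failures = mg->mg_alloc_failures; */

	failures += 2;			/* a concurrent allocator records 2 more */
	failures -= snapshot;		/* atomic_add_64(&..., -snapshot) */

	assert(failures == 2);		/* the two new failures are preserved */
	return (0);
}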
@@ -1164,9 +1183,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
 }
 
 static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
-    uint64_t min_distance, dva_t *dva, int d)
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
+    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
 {
+       spa_t *spa = mg->mg_vd->vdev_spa;
        metaslab_t *msp = NULL;
        uint64_t offset = -1ULL;
        avl_tree_t *t = &mg->mg_metaslab_tree;
@@ -1187,11 +1207,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 
                mutex_enter(&mg->mg_lock);
                for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
-                       if (msp->ms_weight < size) {
+                       if (msp->ms_weight < asize) {
+                               spa_dbgmsg(spa, "%s: failed to meet weight "
+                                   "requirement: vdev %llu, txg %llu, mg %p, "
+                                   "msp %p, psize %llu, asize %llu, "
+                                   "failures %llu, weight %llu",
+                                   spa_name(spa), mg->mg_vd->vdev_id, txg,
+                                   mg, msp, psize, asize,
+                                   mg->mg_alloc_failures, msp->ms_weight);
                                mutex_exit(&mg->mg_lock);
                                return (-1ULL);
                        }
-
                        was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
                        if (activation_weight == METASLAB_WEIGHT_PRIMARY)
                                break;
@@ -1210,6 +1236,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
                if (msp == NULL)
                        return (-1ULL);
 
+               /*
+                * If we've already reached the allowable number of failed
+                * allocation attempts on this metaslab group then we
+                * consider skipping it. We skip it only if we're allowed
+                * to "fast" gang, the physical size is larger than
+                * a gang block, and we're attempting to allocate from
+                * the primary metaslab.
+                */
+               if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
+                   CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
+                   activation_weight == METASLAB_WEIGHT_PRIMARY) {
+                       spa_dbgmsg(spa, "%s: skipping metaslab group: "
+                           "vdev %llu, txg %llu, mg %p, psize %llu, "
+                           "asize %llu, failures %llu", spa_name(spa),
+                           mg->mg_vd->vdev_id, txg, mg, psize, asize,
+                           mg->mg_alloc_failures);
+                       return (-1ULL);
+               }
+
                mutex_enter(&msp->ms_lock);
 
                /*
@@ -1218,7 +1263,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
                 * another thread may have changed the weight while we
                 * were blocked on the metaslab lock.
                 */
-               if (msp->ms_weight < size || (was_active &&
+               if (msp->ms_weight < asize || (was_active &&
                    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
                    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
                        mutex_exit(&msp->ms_lock);
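[Editor's note: the "fast gang" skip added above reduces to a small predicate. A stand-alone restatement with the inputs made explicit; the 512-byte gang block size is an assumption of this sketch, not taken from the patch.]

#include <assert.h>
#include <stdint.h>

/* Hypothetical restatement of the skip decision; not part of the patch. */
static int
should_skip_group(uint64_t group_failures, uint64_t allowed_failures,
    int can_fastgang, uint64_t psize, int is_primary_pass)
{
	/* Skip only when the group keeps failing, the caller is allowed
	 * to gang early, the physical size is larger than a gang block,
	 * and we are still on the primary activation pass. */
	return (group_failures > allowed_failures &&
	    can_fastgang && psize > 512 && is_primary_pass);
}

int
main(void)
{
	/* Group keeps failing and every other condition holds: skip it. */
	assert(should_skip_group(20, 10, 1, 8192, 1));
	/* Gang, zil, and dump allocations never take the shortcut. */
	assert(!should_skip_group(20, 10, 0, 8192, 1));
	/* Small blocks would gang cheaply, so keep searching instead. */
	assert(!should_skip_group(20, 10, 1, 512, 1));
	return (0);
}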
@@ -1233,14 +1278,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
                        continue;
                }
 
-               if (metaslab_activate(msp, activation_weight, size) != 0) {
+               if (metaslab_activate(msp, activation_weight) != 0) {
                        mutex_exit(&msp->ms_lock);
                        continue;
                }
 
-               if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+               if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
                        break;
 
+               atomic_inc_64(&mg->mg_alloc_failures);
+
                metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
 
                mutex_exit(&msp->ms_lock);
@@ -1249,7 +1296,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
        if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
                vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
-       space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+       space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
 
        mutex_exit(&msp->ms_lock);
 
@@ -1263,7 +1310,7 @@ static int
 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
 {
-       metaslab_group_t *mg, *rotor;
+       metaslab_group_t *mg, *fast_mg, *rotor;
        vdev_t *vd;
        int dshift = 3;
        int all_zero;
@@ -1281,6 +1328,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
        if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
                return (ENOSPC);
 
+       if (flags & METASLAB_FASTWRITE)
+               mutex_enter(&mc->mc_fastwrite_lock);
+
        /*
         * Start at the rotor and loop through all mgs until we find something.
         * Note that there's no locking on mc_rotor or mc_aliquot because
@@ -1323,6 +1373,15 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
        } else if (d != 0) {
                vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
                mg = vd->vdev_mg->mg_next;
+       } else if (flags & METASLAB_FASTWRITE) {
+               mg = fast_mg = mc->mc_rotor;
+
+               do {
+                       if (fast_mg->mg_vd->vdev_pending_fastwrite <
+                           mg->mg_vd->vdev_pending_fastwrite)
+                               mg = fast_mg;
+               } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
+
        } else {
                mg = mc->mc_rotor;
        }
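[Editor's note: the METASLAB_FASTWRITE branch above walks the rotor ring once and chooses the metaslab group whose top-level vdev has the fewest fastwrite bytes outstanding, so successive fastwrite allocations spread across vdevs rather than piling onto the rotor's current position. A stand-alone sketch of the same selection over a plain array; types and names are illustrative only.]

/* Illustrative only: pick the element with the smallest pending
 * fastwrite counter, mirroring the rotor scan above. */
#include <stddef.h>
#include <stdint.h>

typedef struct toy_vdev {
	uint64_t pending_fastwrite;
} toy_vdev_t;

static toy_vdev_t *
pick_least_loaded(toy_vdev_t *vd, size_t count)
{
	toy_vdev_t *best = &vd[0];
	size_t i;

	for (i = 1; i < count; i++) {
		if (vd[i].pending_fastwrite < best->pending_fastwrite)
			best = &vd[i];
	}
	return (best);
}

int
main(void)
{
	toy_vdev_t vdevs[3] = { { 300 }, { 100 }, { 200 } };

	/* The middle element has the least outstanding fastwrite bytes. */
	return (pick_least_loaded(vdevs, 3) == &vdevs[1] ? 0 : 1);
}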
@@ -1376,7 +1435,8 @@ top:
                asize = vdev_psize_to_asize(vd, psize);
                ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
-               offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+               offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
+                   dva, d, flags);
                if (offset != -1ULL) {
                        /*
                         * If we've just selected this metaslab group,
@@ -1388,21 +1448,28 @@ top:
                                vdev_stat_t *vs = &vd->vdev_stat;
                                int64_t vu, cu;
 
-                               /*
-                                * Determine percent used in units of 0..1024.
-                                * (This is just to avoid floating point.)
-                                */
-                               vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
-                               cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
+                               vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
+                               cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
 
                                /*
-                                * Bias by at most +/- 25% of the aliquot.
+                                * Calculate how much more or less we should
+                                * try to allocate from this device during
+                                * this iteration around the rotor.
+                                * For example, if a device is 80% full
+                                * and the pool is 20% full then we should
+                                * reduce allocations by 60% on this device.
+                                *
+                                * mg_bias = (20 - 80) * 512K / 100 = -307K
+                                *
+                                * This reduces allocations by 307K for this
+                                * iteration.
                                 */
                                mg->mg_bias = ((cu - vu) *
-                                   (int64_t)mg->mg_aliquot) / (1024 * 4);
+                                   (int64_t)mg->mg_aliquot) / 100;
                        }
 
-                       if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
+                       if ((flags & METASLAB_FASTWRITE) ||
+                           atomic_add_64_nv(&mc->mc_aliquot, asize) >=
                            mg->mg_aliquot + mg->mg_bias) {
                                mc->mc_rotor = mg->mg_next;
                                mc->mc_aliquot = 0;
@@ -1413,6 +1480,12 @@ top:
                        DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
                        DVA_SET_ASIZE(&dva[d], asize);
 
+                       if (flags & METASLAB_FASTWRITE) {
+                               atomic_add_64(&vd->vdev_pending_fastwrite,
+                                   psize);
+                               mutex_exit(&mc->mc_fastwrite_lock);
+                       }
+
                        return (0);
                }
 next:
@@ -1434,6 +1507,8 @@ next:
 
        bzero(&dva[d], sizeof (dva_t));
 
+       if (flags & METASLAB_FASTWRITE)
+               mutex_exit(&mc->mc_fastwrite_lock);
        return (ENOSPC);
 }
 
@@ -1513,7 +1588,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
        mutex_enter(&msp->ms_lock);
 
        if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
-               error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+               error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 
        if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
                error = ENOENT;
@@ -1627,3 +1702,53 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 
        return (error);
 }
+
+void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
+{
+       const dva_t *dva = bp->blk_dva;
+       int ndvas = BP_GET_NDVAS(bp);
+       uint64_t psize = BP_GET_PSIZE(bp);
+       int d;
+       vdev_t *vd;
+
+       ASSERT(!BP_IS_HOLE(bp));
+       ASSERT(psize > 0);
+
+       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+       for (d = 0; d < ndvas; d++) {
+               if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
+                       continue;
+               atomic_add_64(&vd->vdev_pending_fastwrite, psize);
+       }
+
+       spa_config_exit(spa, SCL_VDEV, FTAG);
+}
+
+void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
+{
+       const dva_t *dva = bp->blk_dva;
+       int ndvas = BP_GET_NDVAS(bp);
+       uint64_t psize = BP_GET_PSIZE(bp);
+       int d;
+       vdev_t *vd;
+
+       ASSERT(!BP_IS_HOLE(bp));
+       ASSERT(psize > 0);
+
+       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+       for (d = 0; d < ndvas; d++) {
+               if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
+                       continue;
+               ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
+               atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
+       }
+
+       spa_config_exit(spa, SCL_VDEV, FTAG);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(metaslab_debug, int, 0644);
+MODULE_PARM_DESC(metaslab_debug, "keep space maps in core to verify frees");
+#endif /* _KERNEL && HAVE_SPL */
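
[Editor's note: metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() must stay paired so vdev_pending_fastwrite drains back to its prior value once the write completes. A minimal sketch of that invariant with a plain counter standing in for the per-vdev atomic; illustrative only, not the kernel API.]

/* Each mark of psize bytes is matched by exactly one unmark of the
 * same size, so the counter returns to its starting value. */
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint64_t pending_fastwrite = 0;
	uint64_t psize = 4096;

	pending_fastwrite += psize;		/* metaslab_fastwrite_mark()   */
	assert(pending_fastwrite >= psize);	/* checked before unmarking    */
	pending_fastwrite -= psize;		/* metaslab_fastwrite_unmark() */
	assert(pending_fastwrite == 0);
	return (0);
}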