+ * Determine if the in-core space map representation can be condensed on-disk.
+ * We would like to use the following criteria to make our decision:
+ *
+ * 1. The size of the space map object should not dramatically increase as a
+ * result of writing out our in-core free map.
+ *
+ * 2. The on-disk space map is at least zfs_condense_pct/100 times the size
+ * of its minimal form (e.g. zfs_condense_pct = 110 and minimal = 1MB means
+ * we only condense once the on-disk size reaches 1.1MB).
+ *
+ * Checking the first condition is tricky since we don't want to walk
+ * the entire AVL tree calculating the estimated on-disk size. Instead we
+ * use the size-ordered AVL tree in the space map and calculate the
+ * size required for the largest segment in our in-core free map. If the
+ * size required to represent that segment on disk is larger than the space
+ * map object then we avoid condensing this map.
+ *
+ * To determine the second criterion we use a best-case estimate and assume
+ * each segment can be represented on-disk as a single 64-bit entry. We refer
+ * to this best-case estimate as the space map's minimal form.
+ */
+static boolean_t
+metaslab_should_condense(metaslab_t *msp)
+{
+	space_map_obj_t *ondisk = &msp->ms_smo_syncing;
+	space_map_t *incore = msp->ms_map;
+	space_seg_t *biggest;
+	uint64_t run, nentries, biggest_bytes, minimal_bytes;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT(incore->sm_loaded);
+
+	/*
+	 * sm_pp_root is the size-ordered AVL tree, so its last node is the
+	 * largest segment of the in-core free map. An empty tree means
+	 * there is nothing to compare against: always condense.
+	 */
+	biggest = avl_last(incore->sm_pp_root);
+	if (biggest == NULL)
+		return (B_TRUE);
+
+	/*
+	 * Size in bytes of the 64-bit entries needed on disk for the
+	 * largest segment alone, and of the whole map in its minimal form
+	 * (one 64-bit entry per segment) scaled by zfs_condense_pct.
+	 */
+	run = (biggest->ss_end - biggest->ss_start) >> incore->sm_shift;
+	nentries = run / (MIN(run, SM_RUN_MAX));
+	biggest_bytes = nentries * sizeof (uint64_t);
+	minimal_bytes = (zfs_condense_pct * sizeof (uint64_t) *
+	    avl_numnodes(&incore->sm_root)) / 100;
+	/* Condense only when the object is at least as large as both. */
+	return (biggest_bytes <= ondisk->smo_objsize &&
+	    ondisk->smo_objsize >= minimal_bytes);
+}
+
+/*
+ * Condense the on-disk space map to its minimized form: a small number of
+ * allocation records followed by the in-core free map. Asserts ms_lock is
+ * held and sync pass is 1; ms_lock is dropped around the truncate.
+ */
+static void
+metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK];
+ space_map_t condense_map;
+ space_map_t *sm = msp->ms_map;
+ objset_t *mos = spa_meta_objset(spa);
+ space_map_obj_t *smo = &msp->ms_smo_syncing;
+ int t;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(sm->sm_loaded);
+
+ spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
+ "smo size %llu, segments %lu", txg,
+ (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
+ smo->smo_objsize, avl_numnodes(&sm->sm_root));
+
+ /*
+ * Create a map that is 100% allocated. We then remove segments
+ * that have been freed in this txg, any deferred frees that exist,
+ * and any allocations made in future txgs. Removing segments should
+ * be a relatively inexpensive operation since we expect these maps
+ * to have a small number of nodes.
+ */
+ space_map_create(&condense_map, sm->sm_start, sm->sm_size,
+ sm->sm_shift, sm->sm_lock);
+ space_map_add(&condense_map, condense_map.sm_start,
+ condense_map.sm_size);
+
+ /*
+ * Remove what's been freed in this txg from the condense_map.
+ * Since we're in sync_pass 1, we know that all the frees from
+ * this txg are in the freemap.
+ */
+ space_map_walk(freemap, space_map_remove, &condense_map);
+
+ for (t = 0; t < TXG_DEFER_SIZE; t++)
+ space_map_walk(msp->ms_defermap[t],
+ space_map_remove, &condense_map);
+
+ for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+ space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK],
+ space_map_remove, &condense_map);
+
+ /*
+ * We're about to drop the metaslab's lock, thus allowing
+ * other consumers to change its content. Set the
+ * space_map's sm_condensing flag to ensure that
+ * allocations on this metaslab do not occur while we're
+ * in the middle of committing it to disk. This is only critical
+ * for the ms_map as all other space_maps use per-txg
+ * views of their content.
+ */
+ sm->sm_condensing = B_TRUE;
+
+ mutex_exit(&msp->ms_lock);
+ space_map_truncate(smo, mos, tx);
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * While we would ideally like to create a space_map representation
+ * that consists only of allocation records, doing so can be
+ * prohibitively expensive because the in-core free map can be
+ * large, and therefore computationally expensive to subtract
+ * from the condense_map. Instead we sync out two maps, a cheap
+ * allocation-only map followed by the in-core free map. While not
+ * optimal, this is typically close to optimal, and much cheaper to
+ * compute.
+ */
+ space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx);
+ space_map_vacate(&condense_map, NULL, NULL);
+ space_map_destroy(&condense_map);
+
+ space_map_sync(sm, SM_FREE, smo, mos, tx);
+ sm->sm_condensing = B_FALSE;
+
+ spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, "
+ "smo size %llu", txg,
+ (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
+ smo->smo_objsize);
+}
+
+/*