zfs/lib/libumem/vmem.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26
  27 /* #pragma ident        "@(#)vmem.c     1.10    05/06/08 SMI" */
  28
  29 /*
  30  * For a more complete description of the main ideas, see:
  31  *
  32  *      Jeff Bonwick and Jonathan Adams,
  33  *
  34  *      Magazines and vmem: Extending the Slab Allocator to Many CPUs and
  35  *      Arbitrary Resources.
  36  *
  37  *      Proceedings of the 2001 Usenix Conference.
  38  *      Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
  39  *
  40  * For the "Big Theory Statement", see usr/src/common/os/vmem.c
  41  *
  42  * 1. Overview of changes
  43  * ------------------------------
  44  * There have been a few changes to vmem in order to support umem.  The
  45  * main areas are:
  46  *
  47  *      * VM_SLEEP unsupported
  48  *
  49  *      * Reaping changes
  50  *
  51  *      * initialization changes
  52  *
  53  *      * _vmem_extend_alloc
  54  *
  55  *
  56  * 2. VM_SLEEP Removed
  57  * -------------------
  58  * Since VM_SLEEP allocations can hold locks (in vmem_populate()) for
  59  * possibly infinite amounts of time, they are not supported in this
  60  * version of vmem.  Sleep-like behavior can be achieved through
  61  * UMEM_NOFAIL umem allocations.
  62  *
  63  *
  64  * 3. Reaping changes
  65  * ------------------
  66  * Unlike kmem_reap(), which just asynchronously schedules work, umem_reap()
  67  * can do allocations and frees synchronously.  This is a problem if it
  68  * occurs during a vmem_populate() allocation.
  69  *
  70  * Instead, we delay reaps while populates are active.
  71  *
  72  *
  73  * 4. Initialization changes
  74  * -------------------------
  75  * In the kernel, vmem_init() allows you to create a single, top-level arena,
  76  * which has vmem_internal_arena as a child.  For umem, we want to be able
  77  * to extend arenas dynamically.  It is much easier to support this if we
  78  * allow a two-level "heap" arena:
  79  *
  80  *      +----------+
  81  *      |  "fake"  |
  82  *      +----------+
  83  *            |
  84  *      +----------+
  85  *      |  "heap"  |
  86  *      +----------+
  87  *        |    \ \
  88  *        |     +-+-- ... <other children>
  89  *        |
  90  *      +---------------+
  91  *      | vmem_internal |
  92  *      +---------------+
  93  *          | | | |
  94  *         <children>
  95  *
  96  * The new vmem_init() allows you to specify a "parent" of the heap, along
  97  * with allocation functions.
  98  *
  99  *
 100  * 5. _vmem_extend_alloc
 101  * ---------------------
 102  * The other part of extending is _vmem_extend_alloc.  This function allows
 103  * you to extend (expand current spans, if possible) an arena and allocate
 104  * a chunk of the newly extened span atomically.  This is needed to support
 105  * extending the heap while vmem_populate()ing it.
 106  *
 107  * In order to increase the usefulness of extending, non-imported spans are
 108  * sorted in address order.
 109  */
 110
 111 #include "config.h"
 112 /* #include "mtlib.h" */
 113 #include <sys/vmem_impl_user.h>
 114 #if HAVE_ALLOCA_H
 115 #include <alloca.h>
 116 #endif
 117 #ifdef HAVE_SYS_SYSMACROS_H
 118 #include <sys/sysmacros.h>
 119 #endif
 120 #include <stdio.h>
 121 #if HAVE_STRINGS_H
 122 #include <strings.h>
 123 #endif
 124 #if HAVE_ATOMIC_H
 125 #include <atomic.h>
 126 #endif
 127
 128 #include "vmem_base.h"
 129 #include "umem_base.h"
 130
 131 #define VMEM_INITIAL            6       /* early vmem arenas */
 132 #define VMEM_SEG_INITIAL        100     /* early segments */
 133
 134 /*
 135  * Adding a new span to an arena requires two segment structures: one to
 136  * represent the span, and one to represent the free segment it contains.
 137  */
 138 #define VMEM_SEGS_PER_SPAN_CREATE       2
 139
 140 /*
 141  * Allocating a piece of an existing segment requires 0-2 segment structures
 142  * depending on how much of the segment we're allocating.
 143  *
 144  * To allocate the entire segment, no new segment structures are needed; we
 145  * simply move the existing segment structure from the freelist to the
 146  * allocation hash table.
 147  *
 148  * To allocate a piece from the left or right end of the segment, we must
 149  * split the segment into two pieces (allocated part and remainder), so we
 150  * need one new segment structure to represent the remainder.
 151  *
 152  * To allocate from the middle of a segment, we need two new segment strucures
 153  * to represent the remainders on either side of the allocated part.
 154  */
 155 #define VMEM_SEGS_PER_EXACT_ALLOC       0
 156 #define VMEM_SEGS_PER_LEFT_ALLOC        1
 157 #define VMEM_SEGS_PER_RIGHT_ALLOC       1
 158 #define VMEM_SEGS_PER_MIDDLE_ALLOC      2
 159
 160 /*
 161  * vmem_populate() preallocates segment structures for vmem to do its work.
 162  * It must preallocate enough for the worst case, which is when we must import
 163  * a new span and then allocate from the middle of it.
 164  */
 165 #define VMEM_SEGS_PER_ALLOC_MAX         \
 166         (VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC)
 167
 168 /*
 169  * The segment structures themselves are allocated from vmem_seg_arena, so
 170  * we have a recursion problem when vmem_seg_arena needs to populate itself.
 171  * We address this by working out the maximum number of segment structures
 172  * this act will require, and multiplying by the maximum number of threads
 173  * that we'll allow to do it simultaneously.
 174  *
 175  * The worst-case segment consumption to populate vmem_seg_arena is as
 176  * follows (depicted as a stack trace to indicate why events are occurring):
 177  *
 178  * vmem_alloc(vmem_seg_arena)           -> 2 segs (span create + exact alloc)
 179  *  vmem_alloc(vmem_internal_arena)     -> 2 segs (span create + exact alloc)
 180  *   heap_alloc(heap_arena)
 181  *    vmem_alloc(heap_arena)            -> 4 seg (span create + alloc)
 182  *     parent_alloc(parent_arena)
 183  *      _vmem_extend_alloc(parent_arena) -> 3 seg (span create + left alloc)
 184  *
 185  * Note:  The reservation for heap_arena must be 4, since vmem_xalloc()
 186  * is overly pessimistic on allocations where parent_arena has a stricter
 187  * alignment than heap_arena.
 188  *
 189  * The worst-case consumption for any arena is 4 segment structures.
 190  * For now, we only support VM_NOSLEEP allocations, so as long as we
 191  * serialize all vmem_populates, a 4-seg reserve is sufficient.
 192  */
 193 #define VMEM_POPULATE_SEGS_PER_ARENA    4
 194 #define VMEM_POPULATE_LOCKS             1
 195
 196 #define VMEM_POPULATE_RESERVE           \
 197         (VMEM_POPULATE_SEGS_PER_ARENA * VMEM_POPULATE_LOCKS)
 198
 199 /*
 200  * vmem_populate() ensures that each arena has VMEM_MINFREE seg structures
 201  * so that it can satisfy the worst-case allocation *and* participate in
 202  * worst-case allocation from vmem_seg_arena.
 203  */
 204 #define VMEM_MINFREE    (VMEM_POPULATE_RESERVE + VMEM_SEGS_PER_ALLOC_MAX)
 205
 206 /* Don't assume new statics are zeroed - see vmem_startup() */
 207 static vmem_t vmem0[VMEM_INITIAL];
 208 static vmem_t *vmem_populator[VMEM_INITIAL];
 209 static uint32_t vmem_id;
 210 static uint32_t vmem_populators;
 211 static vmem_seg_t vmem_seg0[VMEM_SEG_INITIAL];
 212 static vmem_seg_t *vmem_segfree;
 213 static mutex_t vmem_list_lock = DEFAULTMUTEX;
 214 static mutex_t vmem_segfree_lock = DEFAULTMUTEX;
 215 static vmem_populate_lock_t vmem_nosleep_lock = {
 216   DEFAULTMUTEX,
 217   0
 218 };
 219 #define IN_POPULATE()   (vmem_nosleep_lock.vmpl_thr == thr_self())
 220 static vmem_t *vmem_list;
 221 static vmem_t *vmem_internal_arena;
 222 static vmem_t *vmem_seg_arena;
 223 static vmem_t *vmem_hash_arena;
 224 static vmem_t *vmem_vmem_arena;
 225
 226 vmem_t *vmem_heap;
 227 vmem_alloc_t *vmem_heap_alloc;
 228 vmem_free_t *vmem_heap_free;
 229
 230 uint32_t vmem_mtbf;             /* mean time between failures [default: off] */
 231 size_t vmem_seg_size = sizeof (vmem_seg_t);
 232
 233 /*
 234  * we use the _ version, since we don't want to be cancelled.
 235  * Actually, this is automatically taken care of by including "mtlib.h".
 236  */
 237 extern int _cond_wait(cond_t *cv, mutex_t *mutex);
 238
 239 /*
 240  * Insert/delete from arena list (type 'a') or next-of-kin list (type 'k').
 241  */
 242 #define VMEM_INSERT(vprev, vsp, type)                                   \
 243 {                                                                       \
 244         vmem_seg_t *vnext = (vprev)->vs_##type##next;                   \
 245         (vsp)->vs_##type##next = (vnext);                               \
 246         (vsp)->vs_##type##prev = (vprev);                               \
 247         (vprev)->vs_##type##next = (vsp);                               \
 248         (vnext)->vs_##type##prev = (vsp);                               \
 249 }
 250
 251 #define VMEM_DELETE(vsp, type)                                          \
 252 {                                                                       \
 253         vmem_seg_t *vprev = (vsp)->vs_##type##prev;                     \
 254         vmem_seg_t *vnext = (vsp)->vs_##type##next;                     \
 255         (vprev)->vs_##type##next = (vnext);                             \
 256         (vnext)->vs_##type##prev = (vprev);                             \
 257 }
 258
 259 /*
 260  * Get a vmem_seg_t from the global segfree list.
 261  */
 262 static vmem_seg_t *
 263 vmem_getseg_global(void)
 264 {
 265         vmem_seg_t *vsp;
 266
 267         (void) mutex_lock(&vmem_segfree_lock);
 268         if ((vsp = vmem_segfree) != NULL)
 269                 vmem_segfree = vsp->vs_knext;
 270         (void) mutex_unlock(&vmem_segfree_lock);
 271
 272         return (vsp);
 273 }
 274
 275 /*
 276  * Put a vmem_seg_t on the global segfree list.
 277  */
 278 static void
 279 vmem_putseg_global(vmem_seg_t *vsp)
 280 {
 281         (void) mutex_lock(&vmem_segfree_lock);
 282         vsp->vs_knext = vmem_segfree;
 283         vmem_segfree = vsp;
 284         (void) mutex_unlock(&vmem_segfree_lock);
 285 }
 286
 287 /*
 288  * Get a vmem_seg_t from vmp's segfree list.
 289  */
 290 static vmem_seg_t *
 291 vmem_getseg(vmem_t *vmp)
 292 {
 293         vmem_seg_t *vsp;
 294
 295         ASSERT(vmp->vm_nsegfree > 0);
 296
 297         vsp = vmp->vm_segfree;
 298         vmp->vm_segfree = vsp->vs_knext;
 299         vmp->vm_nsegfree--;
 300
 301         return (vsp);
 302 }
 303
 304 /*
 305  * Put a vmem_seg_t on vmp's segfree list.
 306  */
 307 static void
 308 vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp)
 309 {
 310         vsp->vs_knext = vmp->vm_segfree;
 311         vmp->vm_segfree = vsp;
 312         vmp->vm_nsegfree++;
 313 }
 314
 315 /*
 316  * Add vsp to the appropriate freelist.
 317  */
 318 static void
 319 vmem_freelist_insert(vmem_t *vmp, vmem_seg_t *vsp)
 320 {
 321         vmem_seg_t *vprev;
 322
 323         ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);
 324
 325         vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1];
 326         vsp->vs_type = VMEM_FREE;
 327         vmp->vm_freemap |= VS_SIZE(vprev);
 328         VMEM_INSERT(vprev, vsp, k);
 329
 330         (void) cond_broadcast(&vmp->vm_cv);
 331 }
 332
 333 /*
 334  * Take vsp from the freelist.
 335  */
 336 static void
 337 vmem_freelist_delete(vmem_t *vmp, vmem_seg_t *vsp)
 338 {
 339         ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);
 340         ASSERT(vsp->vs_type == VMEM_FREE);
 341
 342         if (vsp->vs_knext->vs_start == 0 && vsp->vs_kprev->vs_start == 0) {
 343                 /*
 344                  * The segments on both sides of 'vsp' are freelist heads,
 345                  * so taking vsp leaves the freelist at vsp->vs_kprev empty.
 346                  */
 347                 ASSERT(vmp->vm_freemap & VS_SIZE(vsp->vs_kprev));
 348                 vmp->vm_freemap ^= VS_SIZE(vsp->vs_kprev);
 349         }
 350         VMEM_DELETE(vsp, k);
 351 }
 352
 353 /*
 354  * Add vsp to the allocated-segment hash table and update kstats.
 355  */
 356 static void
 357 vmem_hash_insert(vmem_t *vmp, vmem_seg_t *vsp)
 358 {
 359         vmem_seg_t **bucket;
 360
 361         vsp->vs_type = VMEM_ALLOC;
 362         bucket = VMEM_HASH(vmp, vsp->vs_start);
 363         vsp->vs_knext = *bucket;
 364         *bucket = vsp;
 365
 366         if (vmem_seg_size == sizeof (vmem_seg_t)) {
 367                 vsp->vs_depth = (uint8_t)getpcstack(vsp->vs_stack,
 368                     VMEM_STACK_DEPTH, 0);
 369                 vsp->vs_thread = thr_self();
 370                 vsp->vs_timestamp = gethrtime();
 371         } else {
 372                 vsp->vs_depth = 0;
 373         }
 374
 375         vmp->vm_kstat.vk_alloc++;
 376         vmp->vm_kstat.vk_mem_inuse += VS_SIZE(vsp);
 377 }
 378
 379 /*
 380  * Remove vsp from the allocated-segment hash table and update kstats.
 381  */
 382 static vmem_seg_t *
 383 vmem_hash_delete(vmem_t *vmp, uintptr_t addr, size_t size)
 384 {
 385         vmem_seg_t *vsp, **prev_vspp;
 386
 387         prev_vspp = VMEM_HASH(vmp, addr);
 388         while ((vsp = *prev_vspp) != NULL) {
 389                 if (vsp->vs_start == addr) {
 390                         *prev_vspp = vsp->vs_knext;
 391                         break;
 392                 }
 393                 vmp->vm_kstat.vk_lookup++;
 394                 prev_vspp = &vsp->vs_knext;
 395         }
 396
 397         if (vsp == NULL) {
 398                 umem_panic("vmem_hash_delete(%p, %lx, %lu): bad free",
 399                     vmp, addr, size);
 400         }
 401         if (VS_SIZE(vsp) != size) {
 402                 umem_panic("vmem_hash_delete(%p, %lx, %lu): wrong size "
 403                     "(expect %lu)", vmp, addr, size, VS_SIZE(vsp));
 404         }
 405
 406         vmp->vm_kstat.vk_free++;
 407         vmp->vm_kstat.vk_mem_inuse -= size;
 408
 409         return (vsp);
 410 }
 411
 412 /*
 413  * Create a segment spanning the range [start, end) and add it to the arena.
 414  */
 415 static vmem_seg_t *
 416 vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end)
 417 {
 418         vmem_seg_t *newseg = vmem_getseg(vmp);
 419
 420         newseg->vs_start = start;
 421         newseg->vs_end = end;
 422         newseg->vs_type = 0;
 423         newseg->vs_import = 0;
 424
 425         VMEM_INSERT(vprev, newseg, a);
 426
 427         return (newseg);
 428 }
 429
 430 /*
 431  * Remove segment vsp from the arena.
 432  */
 433 static void
 434 vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp)
 435 {
 436         ASSERT(vsp->vs_type != VMEM_ROTOR);
 437         VMEM_DELETE(vsp, a);
 438
 439         vmem_putseg(vmp, vsp);
 440 }
 441
 442 /*
 443  * Add the span [vaddr, vaddr + size) to vmp and update kstats.
 444  */
 445 static vmem_seg_t *
 446 vmem_span_create(vmem_t *vmp, void *vaddr, size_t size, uint8_t import)
 447 {
 448         vmem_seg_t *knext;
 449         vmem_seg_t *newseg, *span;
 450         uintptr_t start = (uintptr_t)vaddr;
 451         uintptr_t end = start + size;
 452
 453         knext = &vmp->vm_seg0;
 454         if (!import && vmp->vm_source_alloc == NULL) {
 455                 vmem_seg_t *kend, *kprev;
 456                 /*
 457                  * non-imported spans are sorted in address order.  This
 458                  * makes vmem_extend_unlocked() much more effective.
 459                  *
 460                  * We search in reverse order, since new spans are
 461                  * generally at higher addresses.
 462                  */
 463                 kend = &vmp->vm_seg0;
 464                 for (kprev = kend->vs_kprev; kprev != kend;
 465                     kprev = kprev->vs_kprev) {
 466                         if (!kprev->vs_import && (kprev->vs_end - 1) < start)
 467                                 break;
 468                 }
 469                 knext = kprev->vs_knext;
 470         }
 471
 472         ASSERT(MUTEX_HELD(&vmp->vm_lock));
 473
 474         if ((start | end) & (vmp->vm_quantum - 1)) {
 475                 umem_panic("vmem_span_create(%p, %p, %lu): misaligned",
 476                     vmp, vaddr, size);
 477         }
 478
 479         span = vmem_seg_create(vmp, knext->vs_aprev, start, end);
 480         span->vs_type = VMEM_SPAN;
 481         VMEM_INSERT(knext->vs_kprev, span, k);
 482
 483         newseg = vmem_seg_create(vmp, span, start, end);
 484         vmem_freelist_insert(vmp, newseg);
 485
 486         newseg->vs_import = import;
 487         if (import)
 488                 vmp->vm_kstat.vk_mem_import += size;
 489         vmp->vm_kstat.vk_mem_total += size;
 490
 491         return (newseg);
 492 }
 493
 494 /*
 495  * Remove span vsp from vmp and update kstats.
 496  */
 497 static void
 498 vmem_span_destroy(vmem_t *vmp, vmem_seg_t *vsp)
 499 {
 500         vmem_seg_t *span = vsp->vs_aprev;
 501         size_t size = VS_SIZE(vsp);
 502
 503         ASSERT(MUTEX_HELD(&vmp->vm_lock));
 504         ASSERT(span->vs_type == VMEM_SPAN);
 505
 506         if (vsp->vs_import)
 507                 vmp->vm_kstat.vk_mem_import -= size;
 508         vmp->vm_kstat.vk_mem_total -= size;
 509
 510         VMEM_DELETE(span, k);
 511
 512         vmem_seg_destroy(vmp, vsp);
 513         vmem_seg_destroy(vmp, span);
 514 }
 515
 516 /*
 517  * Allocate the subrange [addr, addr + size) from segment vsp.
 518  * If there are leftovers on either side, place them on the freelist.
 519  * Returns a pointer to the segment representing [addr, addr + size).
 520  */
 521 static vmem_seg_t *
 522 vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, size_t size)
 523 {
 524         uintptr_t vs_start = vsp->vs_start;
 525         uintptr_t vs_end = vsp->vs_end;
 526         size_t vs_size = vs_end - vs_start;
 527         size_t realsize = P2ROUNDUP(size, vmp->vm_quantum);
 528         uintptr_t addr_end = addr + realsize;
 529
 530         ASSERT(P2PHASE(vs_start, vmp->vm_quantum) == 0);
 531         ASSERT(P2PHASE(addr, vmp->vm_quantum) == 0);
 532         ASSERT(vsp->vs_type == VMEM_FREE);
 533         ASSERT(addr >= vs_start && addr_end - 1 <= vs_end - 1);
 534         ASSERT(addr - 1 <= addr_end - 1);
 535
 536         /*
 537          * If we're allocating from the start of the segment, and the
 538          * remainder will be on the same freelist, we can save quite
 539          * a bit of work.
 540          */
 541         if (P2SAMEHIGHBIT(vs_size, vs_size - realsize) && addr == vs_start) {
 542                 ASSERT(highbit(vs_size) == highbit(vs_size - realsize));
 543                 vsp->vs_start = addr_end;
 544                 vsp = vmem_seg_create(vmp, vsp->vs_aprev, addr, addr + size);
 545                 vmem_hash_insert(vmp, vsp);
 546                 return (vsp);
 547         }
 548
 549         vmem_freelist_delete(vmp, vsp);
 550
 551         if (vs_end != addr_end)
 552                 vmem_freelist_insert(vmp,
 553                     vmem_seg_create(vmp, vsp, addr_end, vs_end));
 554
 555         if (vs_start != addr)
 556                 vmem_freelist_insert(vmp,
 557                     vmem_seg_create(vmp, vsp->vs_aprev, vs_start, addr));
 558
 559         vsp->vs_start = addr;
 560         vsp->vs_end = addr + size;
 561
 562         vmem_hash_insert(vmp, vsp);
 563         return (vsp);
 564 }
 565
 566 /*
 567  * We cannot reap if we are in the middle of a vmem_populate().
 568  */
 569 void
 570 vmem_reap(void)
 571 {
 572         if (!IN_POPULATE())
 573                 umem_reap();
 574 }
 575
 576 /*
 577  * Populate vmp's segfree list with VMEM_MINFREE vmem_seg_t structures.
 578  */
 579 static int
 580 vmem_populate(vmem_t *vmp, int vmflag)
 581 {
 582         char *p;
 583         vmem_seg_t *vsp;
 584         ssize_t nseg;
 585         size_t size;
 586         vmem_populate_lock_t *lp;
 587         int i;
 588
 589         while (vmp->vm_nsegfree < VMEM_MINFREE &&
 590             (vsp = vmem_getseg_global()) != NULL)
 591                 vmem_putseg(vmp, vsp);
 592
 593         if (vmp->vm_nsegfree >= VMEM_MINFREE)
 594                 return (1);
 595
 596         /*
 597          * If we're already populating, tap the reserve.
 598          */
 599         if (vmem_nosleep_lock.vmpl_thr == thr_self()) {
 600                 ASSERT(vmp->vm_cflags & VMC_POPULATOR);
 601                 return (1);
 602         }
 603
 604         (void) mutex_unlock(&vmp->vm_lock);
 605
 606         ASSERT(vmflag & VM_NOSLEEP);    /* we do not allow sleep allocations */
 607         lp = &vmem_nosleep_lock;
 608
 609         /*
 610          * Cannot be just a mutex_lock(), since that has no effect if
 611          * libthread is not linked.
 612          */
 613         (void) mutex_lock(&lp->vmpl_mutex);
 614         ASSERT(lp->vmpl_thr == 0);
 615         lp->vmpl_thr = thr_self();
 616
 617         nseg = VMEM_MINFREE + vmem_populators * VMEM_POPULATE_RESERVE;
 618         size = P2ROUNDUP(nseg * vmem_seg_size, vmem_seg_arena->vm_quantum);
 619         nseg = size / vmem_seg_size;
 620
 621         /*
 622          * The following vmem_alloc() may need to populate vmem_seg_arena
 623          * and all the things it imports from.  When doing so, it will tap
 624          * each arena's reserve to prevent recursion (see the block comment
 625          * above the definition of VMEM_POPULATE_RESERVE).
 626          *
 627          * During this allocation, vmem_reap() is a no-op.  If the allocation
 628          * fails, we call vmem_reap() after dropping the population lock.
 629          */
 630         p = vmem_alloc(vmem_seg_arena, size, vmflag & VM_UMFLAGS);
 631         if (p == NULL) {
 632                 lp->vmpl_thr = 0;
 633                 (void) mutex_unlock(&lp->vmpl_mutex);
 634                 vmem_reap();
 635
 636                 (void) mutex_lock(&vmp->vm_lock);
 637                 vmp->vm_kstat.vk_populate_fail++;
 638                 return (0);
 639         }
 640         /*
 641          * Restock the arenas that may have been depleted during population.
 642          */
 643         for (i = 0; i < vmem_populators; i++) {
 644                 (void) mutex_lock(&vmem_populator[i]->vm_lock);
 645                 while (vmem_populator[i]->vm_nsegfree < VMEM_POPULATE_RESERVE)
 646                         vmem_putseg(vmem_populator[i],
 647                             (vmem_seg_t *)(p + --nseg * vmem_seg_size));
 648                 (void) mutex_unlock(&vmem_populator[i]->vm_lock);
 649         }
 650
 651         lp->vmpl_thr = 0;
 652         (void) mutex_unlock(&lp->vmpl_mutex);
 653         (void) mutex_lock(&vmp->vm_lock);
 654
 655         /*
 656          * Now take our own segments.
 657          */
 658         ASSERT(nseg >= VMEM_MINFREE);
 659         while (vmp->vm_nsegfree < VMEM_MINFREE)
 660                 vmem_putseg(vmp, (vmem_seg_t *)(p + --nseg * vmem_seg_size));
 661
 662         /*
 663          * Give the remainder to charity.
 664          */
 665         while (nseg > 0)
 666                 vmem_putseg_global((vmem_seg_t *)(p + --nseg * vmem_seg_size));
 667
 668         return (1);
 669 }
 670
 671 /*
 672  * Advance a walker from its previous position to 'afterme'.
 673  * Note: may drop and reacquire vmp->vm_lock.
 674  */
 675 static void
 676 vmem_advance(vmem_t *vmp, vmem_seg_t *walker, vmem_seg_t *afterme)
 677 {
 678         vmem_seg_t *vprev = walker->vs_aprev;
 679         vmem_seg_t *vnext = walker->vs_anext;
 680         vmem_seg_t *vsp = NULL;
 681
 682         VMEM_DELETE(walker, a);
 683
 684         if (afterme != NULL)
 685                 VMEM_INSERT(afterme, walker, a);
 686
 687         /*
 688          * The walker segment's presence may have prevented its neighbors
 689          * from coalescing.  If so, coalesce them now.
 690          */
 691         if (vprev->vs_type == VMEM_FREE) {
 692                 if (vnext->vs_type == VMEM_FREE) {
 693                         ASSERT(vprev->vs_end == vnext->vs_start);
 694                         vmem_freelist_delete(vmp, vnext);
 695                         vmem_freelist_delete(vmp, vprev);
 696                         vprev->vs_end = vnext->vs_end;
 697                         vmem_freelist_insert(vmp, vprev);
 698                         vmem_seg_destroy(vmp, vnext);
 699                 }
 700                 vsp = vprev;
 701         } else if (vnext->vs_type == VMEM_FREE) {
 702                 vsp = vnext;
 703         }
 704
 705         /*
 706          * vsp could represent a complete imported span,
 707          * in which case we must return it to the source.
 708          */
 709         if (vsp != NULL && vsp->vs_import && vmp->vm_source_free != NULL &&
 710             vsp->vs_aprev->vs_type == VMEM_SPAN &&
 711             vsp->vs_anext->vs_type == VMEM_SPAN) {
 712                 void *vaddr = (void *)vsp->vs_start;
 713                 size_t size = VS_SIZE(vsp);
 714                 ASSERT(size == VS_SIZE(vsp->vs_aprev));
 715                 vmem_freelist_delete(vmp, vsp);
 716                 vmem_span_destroy(vmp, vsp);
 717                 (void) mutex_unlock(&vmp->vm_lock);
 718                 vmp->vm_source_free(vmp->vm_source, vaddr, size);
 719                 (void) mutex_lock(&vmp->vm_lock);
 720         }
 721 }
 722
 723 /*
 724  * VM_NEXTFIT allocations deliberately cycle through all virtual addresses
 725  * in an arena, so that we avoid reusing addresses for as long as possible.
 726  * This helps to catch used-after-freed bugs.  It's also the perfect policy
 727  * for allocating things like process IDs, where we want to cycle through
 728  * all values in order.
 729  */
 730 static void *
 731 vmem_nextfit_alloc(vmem_t *vmp, size_t size, int vmflag)
 732 {
 733         vmem_seg_t *vsp, *rotor;
 734         uintptr_t addr;
 735         size_t realsize = P2ROUNDUP(size, vmp->vm_quantum);
 736         size_t vs_size;
 737
 738         (void) mutex_lock(&vmp->vm_lock);
 739
 740         if (vmp->vm_nsegfree < VMEM_MINFREE && !vmem_populate(vmp, vmflag)) {
 741                 (void) mutex_unlock(&vmp->vm_lock);
 742                 return (NULL);
 743         }
 744
 745         /*
 746          * The common case is that the segment right after the rotor is free,
 747          * and large enough that extracting 'size' bytes won't change which
 748          * freelist it's on.  In this case we can avoid a *lot* of work.
 749          * Instead of the normal vmem_seg_alloc(), we just advance the start
 750          * address of the victim segment.  Instead of moving the rotor, we
 751          * create the new segment structure *behind the rotor*, which has
 752          * the same effect.  And finally, we know we don't have to coalesce
 753          * the rotor's neighbors because the new segment lies between them.
 754          */
 755         rotor = &vmp->vm_rotor;
 756         vsp = rotor->vs_anext;
 757         if (vsp->vs_type == VMEM_FREE && (vs_size = VS_SIZE(vsp)) > realsize &&
 758             P2SAMEHIGHBIT(vs_size, vs_size - realsize)) {
 759                 ASSERT(highbit(vs_size) == highbit(vs_size - realsize));
 760                 addr = vsp->vs_start;
 761                 vsp->vs_start = addr + realsize;
 762                 vmem_hash_insert(vmp,
 763                     vmem_seg_create(vmp, rotor->vs_aprev, addr, addr + size));
 764                 (void) mutex_unlock(&vmp->vm_lock);
 765                 return ((void *)addr);
 766         }
 767
 768         /*
 769          * Starting at the rotor, look for a segment large enough to
 770          * satisfy the allocation.
 771          */
 772         for (;;) {
 773                 vmp->vm_kstat.vk_search++;
 774                 if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size)
 775                         break;
 776                 vsp = vsp->vs_anext;
 777                 if (vsp == rotor) {
 778                         /*
 779                          * We've come full circle.  One possibility is that the
 780                          * there's actually enough space, but the rotor itself
 781                          * is preventing the allocation from succeeding because
 782                          * it's sitting between two free segments.  Therefore,
 783                          * we advance the rotor and see if that liberates a
 784                          * suitable segment.
 785                          */
 786                         vmem_advance(vmp, rotor, rotor->vs_anext);
 787                         vsp = rotor->vs_aprev;
 788                         if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size)
 789                                 break;
 790                         /*
 791                          * If there's a lower arena we can import from, or it's
 792                          * a VM_NOSLEEP allocation, let vmem_xalloc() handle it.
 793                          * Otherwise, wait until another thread frees something.
 794                          */
 795                         if (vmp->vm_source_alloc != NULL ||
 796                             (vmflag & VM_NOSLEEP)) {
 797                                 (void) mutex_unlock(&vmp->vm_lock);
 798                                 return (vmem_xalloc(vmp, size, vmp->vm_quantum,
 799                                     0, 0, NULL, NULL, vmflag & VM_UMFLAGS));
 800                         }
 801                         vmp->vm_kstat.vk_wait++;
 802                         (void) _cond_wait(&vmp->vm_cv, &vmp->vm_lock);
 803                         vsp = rotor->vs_anext;
 804                 }
 805         }
 806
 807         /*
 808          * We found a segment.  Extract enough space to satisfy the allocation.
 809          */
 810         addr = vsp->vs_start;
 811         vsp = vmem_seg_alloc(vmp, vsp, addr, size);
 812         ASSERT(vsp->vs_type == VMEM_ALLOC &&
 813             vsp->vs_start == addr && vsp->vs_end == addr + size);
 814
 815         /*
 816          * Advance the rotor to right after the newly-allocated segment.
 817          * That's where the next VM_NEXTFIT allocation will begin searching.
 818          */
 819         vmem_advance(vmp, rotor, vsp);
 820         (void) mutex_unlock(&vmp->vm_lock);
 821         return ((void *)addr);
 822 }
 823
 824 /*
 825  * Allocate size bytes at offset phase from an align boundary such that the
 826  * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr)
 827  * that does not straddle a nocross-aligned boundary.
 828  */
 829 void *
 830 vmem_xalloc(vmem_t *vmp, size_t size, size_t align, size_t phase,
 831         size_t nocross, void *minaddr, void *maxaddr, int vmflag)
 832 {
 833         vmem_seg_t *vsp;
 834         vmem_seg_t *vbest = NULL;
 835         uintptr_t addr, taddr, start, end;
 836         void *vaddr;
 837         int hb, flist, resv;
 838         uint32_t mtbf;
 839
 840         if (phase > 0 && phase >= align)
 841                 umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
 842                     "invalid phase",
 843                     (void *)vmp, size, align, phase, nocross,
 844                     minaddr, maxaddr, vmflag);
 845
 846         if (align == 0)
 847                 align = vmp->vm_quantum;
 848
 849         if ((align | phase | nocross) & (vmp->vm_quantum - 1)) {
 850                 umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
 851                     "parameters not vm_quantum aligned",
 852                     (void *)vmp, size, align, phase, nocross,
 853                     minaddr, maxaddr, vmflag);
 854         }
 855
 856         if (nocross != 0 &&
 857             (align > nocross || P2ROUNDUP(phase + size, align) > nocross)) {
 858                 umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
 859                     "overconstrained allocation",
 860                     (void *)vmp, size, align, phase, nocross,
 861                     minaddr, maxaddr, vmflag);
 862         }
 863
 864         if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 &&
 865             (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP)
 866                 return (NULL);
 867
 868         (void) mutex_lock(&vmp->vm_lock);
 869         for (;;) {
 870                 if (vmp->vm_nsegfree < VMEM_MINFREE &&
 871                     !vmem_populate(vmp, vmflag))
 872                         break;
 873
 874                 /*
 875                  * highbit() returns the highest bit + 1, which is exactly
 876                  * what we want: we want to search the first freelist whose
 877                  * members are *definitely* large enough to satisfy our
 878                  * allocation.  However, there are certain cases in which we
 879                  * want to look at the next-smallest freelist (which *might*
 880                  * be able to satisfy the allocation):
 881                  *
 882                  * (1)  The size is exactly a power of 2, in which case
 883                  *      the smaller freelist is always big enough;
 884                  *
 885                  * (2)  All other freelists are empty;
 886                  *
 887                  * (3)  We're in the highest possible freelist, which is
 888                  *      always empty (e.g. the 4GB freelist on 32-bit systems);
 889                  *
 890                  * (4)  We're doing a best-fit or first-fit allocation.
 891                  */
 892                 if ((size & (size - 1)) == 0) {
 893                         flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
 894                 } else {
 895                         hb = highbit(size);
 896                         if ((vmp->vm_freemap >> hb) == 0 ||
 897                             hb == VMEM_FREELISTS ||
 898                             (vmflag & (VM_BESTFIT | VM_FIRSTFIT)))
 899                                 hb--;
 900                         flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));
 901                 }
 902
 903                 for (vbest = NULL, vsp = (flist == 0) ? NULL :
 904                     vmp->vm_freelist[flist - 1].vs_knext;
 905                     vsp != NULL; vsp = vsp->vs_knext) {
 906                         vmp->vm_kstat.vk_search++;
 907                         if (vsp->vs_start == 0) {
 908                                 /*
 909                                  * We're moving up to a larger freelist,
 910                                  * so if we've already found a candidate,
 911                                  * the fit can't possibly get any better.
 912                                  */
 913                                 if (vbest != NULL)
 914                                         break;
 915                                 /*
 916                                  * Find the next non-empty freelist.
 917                                  */
 918                                 flist = lowbit(P2ALIGN(vmp->vm_freemap,
 919                                     VS_SIZE(vsp)));
 920                                 if (flist-- == 0)
 921                                         break;
 922                                 vsp = (vmem_seg_t *)&vmp->vm_freelist[flist];
 923                                 ASSERT(vsp->vs_knext->vs_type == VMEM_FREE);
 924                                 continue;
 925                         }
 926                         if (vsp->vs_end - 1 < (uintptr_t)minaddr)
 927                                 continue;
 928                         if (vsp->vs_start > (uintptr_t)maxaddr - 1)
 929                                 continue;
 930                         start = MAX(vsp->vs_start, (uintptr_t)minaddr);
 931                         end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1;
 932                         taddr = P2PHASEUP(start, align, phase);
 933                         if (P2CROSS(taddr, taddr + size - 1, nocross))
 934                                 taddr +=
 935                                     P2ROUNDUP(P2NPHASE(taddr, nocross), align);
 936                         if ((taddr - start) + size > end - start ||
 937                             (vbest != NULL && VS_SIZE(vsp) >= VS_SIZE(vbest)))
 938                                 continue;
 939                         vbest = vsp;
 940                         addr = taddr;
 941                         if (!(vmflag & VM_BESTFIT) || VS_SIZE(vbest) == size)
 942                                 break;
 943                 }
 944                 if (vbest != NULL)
 945                         break;
 946                 if (size == 0)
 947                         umem_panic("vmem_xalloc(): size == 0");
 948                 if (vmp->vm_source_alloc != NULL && nocross == 0 &&
 949                     minaddr == NULL && maxaddr == NULL) {
 950                         size_t asize = P2ROUNDUP(size + phase,
 951                             MAX(align, vmp->vm_source->vm_quantum));
 952                         if (asize < size) {             /* overflow */
 953                                 (void) mutex_unlock(&vmp->vm_lock);
 954                                 if (vmflag & VM_NOSLEEP)
 955                                         return (NULL);
 956
 957                                 umem_panic("vmem_xalloc(): "
 958                                     "overflow on VM_SLEEP allocation");
 959                         }
 960                         /*
 961                          * Determine how many segment structures we'll consume.
 962                          * The calculation must be presise because if we're
 963                          * here on behalf of vmem_populate(), we are taking
 964                          * segments from a very limited reserve.
 965                          */
 966                         resv = (size == asize) ?
 967                             VMEM_SEGS_PER_SPAN_CREATE +
 968                             VMEM_SEGS_PER_EXACT_ALLOC :
 969                             VMEM_SEGS_PER_ALLOC_MAX;
 970                         ASSERT(vmp->vm_nsegfree >= resv);
 971                         vmp->vm_nsegfree -= resv;       /* reserve our segs */
 972                         (void) mutex_unlock(&vmp->vm_lock);
 973                         vaddr = vmp->vm_source_alloc(vmp->vm_source, asize,
 974                             vmflag & VM_UMFLAGS);
 975                         (void) mutex_lock(&vmp->vm_lock);
 976                         vmp->vm_nsegfree += resv;       /* claim reservation */
 977                         if (vaddr != NULL) {
 978                                 vbest = vmem_span_create(vmp, vaddr, asize, 1);
 979                                 addr = P2PHASEUP(vbest->vs_start, align, phase);
 980                                 break;
 981                         }
 982                 }
 983                 (void) mutex_unlock(&vmp->vm_lock);
 984                 vmem_reap();
 985                 (void) mutex_lock(&vmp->vm_lock);
 986                 if (vmflag & VM_NOSLEEP)
 987                         break;
 988                 vmp->vm_kstat.vk_wait++;
 989                 (void) _cond_wait(&vmp->vm_cv, &vmp->vm_lock);
 990         }
 991         if (vbest != NULL) {
 992                 ASSERT(vbest->vs_type == VMEM_FREE);
 993                 ASSERT(vbest->vs_knext != vbest);
 994                 (void) vmem_seg_alloc(vmp, vbest, addr, size);
 995                 (void) mutex_unlock(&vmp->vm_lock);
 996                 ASSERT(P2PHASE(addr, align) == phase);
 997                 ASSERT(!P2CROSS(addr, addr + size - 1, nocross));
 998                 ASSERT(addr >= (uintptr_t)minaddr);
 999                 ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1);
1000                 return ((void *)addr);
1001         }
1002         vmp->vm_kstat.vk_fail++;
1003         (void) mutex_unlock(&vmp->vm_lock);
1004         if (vmflag & VM_PANIC)
1005                 umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
1006                     "cannot satisfy mandatory allocation",
1007                     (void *)vmp, size, align, phase, nocross,
1008                     minaddr, maxaddr, vmflag);
1009         return (NULL);
1010 }
1011
1012 /*
1013  * Free the segment [vaddr, vaddr + size), where vaddr was a constrained
1014  * allocation.  vmem_xalloc() and vmem_xfree() must always be paired because
1015  * both routines bypass the quantum caches.
1016  */
1017 void
1018 vmem_xfree(vmem_t *vmp, void *vaddr, size_t size)
1019 {
1020         vmem_seg_t *vsp, *vnext, *vprev;
1021
1022         (void) mutex_lock(&vmp->vm_lock);
1023
1024         vsp = vmem_hash_delete(vmp, (uintptr_t)vaddr, size);
1025         vsp->vs_end = P2ROUNDUP(vsp->vs_end, vmp->vm_quantum);
1026
1027         /*
1028          * Attempt to coalesce with the next segment.
1029          */
1030         vnext = vsp->vs_anext;
1031         if (vnext->vs_type == VMEM_FREE) {
1032                 ASSERT(vsp->vs_end == vnext->vs_start);
1033                 vmem_freelist_delete(vmp, vnext);
1034                 vsp->vs_end = vnext->vs_end;
1035                 vmem_seg_destroy(vmp, vnext);
1036         }
1037
1038         /*
1039          * Attempt to coalesce with the previous segment.
1040          */
1041         vprev = vsp->vs_aprev;
1042         if (vprev->vs_type == VMEM_FREE) {
1043                 ASSERT(vprev->vs_end == vsp->vs_start);
1044                 vmem_freelist_delete(vmp, vprev);
1045                 vprev->vs_end = vsp->vs_end;
1046                 vmem_seg_destroy(vmp, vsp);
1047                 vsp = vprev;
1048         }
1049
1050         /*
1051          * If the entire span is free, return it to the source.
1052          */
1053         if (vsp->vs_import && vmp->vm_source_free != NULL &&
1054             vsp->vs_aprev->vs_type == VMEM_SPAN &&
1055             vsp->vs_anext->vs_type == VMEM_SPAN) {
1056                 vaddr = (void *)vsp->vs_start;
1057                 size = VS_SIZE(vsp);
1058                 ASSERT(size == VS_SIZE(vsp->vs_aprev));
1059                 vmem_span_destroy(vmp, vsp);
1060                 (void) mutex_unlock(&vmp->vm_lock);
1061                 vmp->vm_source_free(vmp->vm_source, vaddr, size);
1062         } else {
1063                 vmem_freelist_insert(vmp, vsp);
1064                 (void) mutex_unlock(&vmp->vm_lock);
1065         }
1066 }
1067
1068 /*
1069  * Allocate size bytes from arena vmp.  Returns the allocated address
1070  * on success, NULL on failure.  vmflag specifies VM_SLEEP or VM_NOSLEEP,
1071  * and may also specify best-fit, first-fit, or next-fit allocation policy
1072  * instead of the default instant-fit policy.  VM_SLEEP allocations are
1073  * guaranteed to succeed.
1074  */
1075 void *
1076 vmem_alloc(vmem_t *vmp, size_t size, int vmflag)
1077 {
1078         vmem_seg_t *vsp;
1079         uintptr_t addr;
1080         int hb;
1081         int flist = 0;
1082         uint32_t mtbf;
1083
1084         if (size - 1 < vmp->vm_qcache_max) {
1085                 ASSERT(vmflag & VM_NOSLEEP);
1086                 return (_umem_cache_alloc(vmp->vm_qcache[(size - 1) >>
1087                     vmp->vm_qshift], UMEM_DEFAULT));
1088         }
1089
1090         if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 &&
1091             (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP)
1092                 return (NULL);
1093
1094         if (vmflag & VM_NEXTFIT)
1095                 return (vmem_nextfit_alloc(vmp, size, vmflag));
1096
1097         if (vmflag & (VM_BESTFIT | VM_FIRSTFIT))
1098                 return (vmem_xalloc(vmp, size, vmp->vm_quantum, 0, 0,
1099                     NULL, NULL, vmflag));
1100
1101         /*
1102          * Unconstrained instant-fit allocation from the segment list.
1103          */
1104         (void) mutex_lock(&vmp->vm_lock);
1105
1106         if (vmp->vm_nsegfree >= VMEM_MINFREE || vmem_populate(vmp, vmflag)) {
1107                 if ((size & (size - 1)) == 0)
1108                         flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
1109                 else if ((hb = highbit(size)) < VMEM_FREELISTS)
1110                         flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));
1111         }
1112
1113         if (flist-- == 0) {
1114                 (void) mutex_unlock(&vmp->vm_lock);
1115                 return (vmem_xalloc(vmp, size, vmp->vm_quantum,
1116                     0, 0, NULL, NULL, vmflag));
1117         }
1118
1119         ASSERT(size <= (1UL << flist));
1120         vsp = vmp->vm_freelist[flist].vs_knext;
1121         addr = vsp->vs_start;
1122         (void) vmem_seg_alloc(vmp, vsp, addr, size);
1123         (void) mutex_unlock(&vmp->vm_lock);
1124         return ((void *)addr);
1125 }
1126
1127 /*
1128  * Free the segment [vaddr, vaddr + size).
1129  */
1130 void
1131 vmem_free(vmem_t *vmp, void *vaddr, size_t size)
1132 {
1133         if (size - 1 < vmp->vm_qcache_max)
1134                 _umem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift],
1135                     vaddr);
1136         else
1137                 vmem_xfree(vmp, vaddr, size);
1138 }
1139
1140 /*
1141  * Determine whether arena vmp contains the segment [vaddr, vaddr + size).
1142  */
1143 int
1144 vmem_contains(vmem_t *vmp, void *vaddr, size_t size)
1145 {
1146         uintptr_t start = (uintptr_t)vaddr;
1147         uintptr_t end = start + size;
1148         vmem_seg_t *vsp;
1149         vmem_seg_t *seg0 = &vmp->vm_seg0;
1150
1151         (void) mutex_lock(&vmp->vm_lock);
1152         vmp->vm_kstat.vk_contains++;
1153         for (vsp = seg0->vs_knext; vsp != seg0; vsp = vsp->vs_knext) {
1154                 vmp->vm_kstat.vk_contains_search++;
1155                 ASSERT(vsp->vs_type == VMEM_SPAN);
1156                 if (start >= vsp->vs_start && end - 1 <= vsp->vs_end - 1)
1157                         break;
1158         }
1159         (void) mutex_unlock(&vmp->vm_lock);
1160         return (vsp != seg0);
1161 }
1162
1163 /*
1164  * Add the span [vaddr, vaddr + size) to arena vmp.
1165  */
1166 void *
1167 vmem_add(vmem_t *vmp, void *vaddr, size_t size, int vmflag)
1168 {
1169         if (vaddr == NULL || size == 0) {
1170                 umem_panic("vmem_add(%p, %p, %lu): bad arguments",
1171                     vmp, vaddr, size);
1172         }
1173
1174         ASSERT(!vmem_contains(vmp, vaddr, size));
1175
1176         (void) mutex_lock(&vmp->vm_lock);
1177         if (vmem_populate(vmp, vmflag))
1178                 (void) vmem_span_create(vmp, vaddr, size, 0);
1179         else
1180                 vaddr = NULL;
1181         (void) cond_broadcast(&vmp->vm_cv);
1182         (void) mutex_unlock(&vmp->vm_lock);
1183         return (vaddr);
1184 }
1185
1186 /*
1187  * Adds the address range [addr, endaddr) to arena vmp, by either:
1188  *   1. joining two existing spans, [x, addr), and [endaddr, y) (which
1189  *      are in that order) into a single [x, y) span,
1190  *   2. expanding an existing [x, addr) span to [x, endaddr),
1191  *   3. expanding an existing [endaddr, x) span to [addr, x), or
1192  *   4. creating a new [addr, endaddr) span.
1193  *
1194  * Called with vmp->vm_lock held, and a successful vmem_populate() completed.
1195  * Cannot fail.  Returns the new segment.
1196  *
1197  * NOTE:  this algorithm is linear-time in the number of spans, but is
1198  *      constant-time when you are extending the last (highest-addressed)
1199  *      span.
1200  */
1201 static vmem_seg_t *
1202 vmem_extend_unlocked(vmem_t *vmp, uintptr_t addr, uintptr_t endaddr)
1203 {
1204         vmem_seg_t *span;
1205         vmem_seg_t *vsp;
1206
1207         vmem_seg_t *end = &vmp->vm_seg0;
1208
1209         ASSERT(MUTEX_HELD(&vmp->vm_lock));
1210
1211         /*
1212          * the second "if" clause below relies on the direction of this search
1213          */
1214         for (span = end->vs_kprev; span != end; span = span->vs_kprev) {
1215                 if (span->vs_end == addr || span->vs_start == endaddr)
1216                         break;
1217         }
1218
1219         if (span == end)
1220                 return (vmem_span_create(vmp, (void *)addr, endaddr - addr, 0));
1221         if (span->vs_kprev->vs_end == addr && span->vs_start == endaddr) {
1222                 vmem_seg_t *prevspan = span->vs_kprev;
1223                 vmem_seg_t *nextseg = span->vs_anext;
1224                 vmem_seg_t *prevseg = span->vs_aprev;
1225
1226                 /*
1227                  * prevspan becomes the span marker for the full range
1228                  */
1229                 prevspan->vs_end = span->vs_end;
1230
1231                 /*
1232                  * Notionally, span becomes a free segment representing
1233                  * [addr, endaddr).
1234                  *
1235                  * However, if either of its neighbors are free, we coalesce
1236                  * by destroying span and changing the free segment.
1237                  */
1238                 if (prevseg->vs_type == VMEM_FREE &&
1239                     nextseg->vs_type == VMEM_FREE) {
1240                         /*
1241                          * coalesce both ways
1242                          */
1243                         ASSERT(prevseg->vs_end == addr &&
1244                             nextseg->vs_start == endaddr);
1245
1246                         vmem_freelist_delete(vmp, prevseg);
1247                         prevseg->vs_end = nextseg->vs_end;
1248
1249                         vmem_freelist_delete(vmp, nextseg);
1250                         VMEM_DELETE(span, k);
1251                         vmem_seg_destroy(vmp, nextseg);
1252                         vmem_seg_destroy(vmp, span);
1253
1254                         vsp = prevseg;
1255                 } else if (prevseg->vs_type == VMEM_FREE) {
1256                         /*
1257                          * coalesce left
1258                          */
1259                         ASSERT(prevseg->vs_end == addr);
1260
1261                         VMEM_DELETE(span, k);
1262                         vmem_seg_destroy(vmp, span);
1263
1264                         vmem_freelist_delete(vmp, prevseg);
1265                         prevseg->vs_end = endaddr;
1266
1267                         vsp = prevseg;
1268                 } else if (nextseg->vs_type == VMEM_FREE) {
1269                         /*
1270                          * coalesce right
1271                          */
1272                         ASSERT(nextseg->vs_start == endaddr);
1273
1274                         VMEM_DELETE(span, k);
1275                         vmem_seg_destroy(vmp, span);
1276
1277                         vmem_freelist_delete(vmp, nextseg);
1278                         nextseg->vs_start = addr;
1279
1280                         vsp = nextseg;
1281                 } else {
1282                         /*
1283                          * cannnot coalesce
1284                          */
1285                         VMEM_DELETE(span, k);
1286                         span->vs_start = addr;
1287                         span->vs_end = endaddr;
1288
1289                         vsp = span;
1290                 }
1291         } else if (span->vs_end == addr) {
1292                 vmem_seg_t *oldseg = span->vs_knext->vs_aprev;
1293                 span->vs_end = endaddr;
1294
1295                 ASSERT(oldseg->vs_type != VMEM_SPAN);
1296                 if (oldseg->vs_type == VMEM_FREE) {
1297                         ASSERT(oldseg->vs_end == addr);
1298                         vmem_freelist_delete(vmp, oldseg);
1299                         oldseg->vs_end = endaddr;
1300                         vsp = oldseg;
1301                 } else
1302                         vsp = vmem_seg_create(vmp, oldseg, addr, endaddr);
1303         } else {
1304                 vmem_seg_t *oldseg = span->vs_anext;
1305                 ASSERT(span->vs_start == endaddr);
1306                 span->vs_start = addr;
1307
1308                 ASSERT(oldseg->vs_type != VMEM_SPAN);
1309                 if (oldseg->vs_type == VMEM_FREE) {
1310                         ASSERT(oldseg->vs_start == endaddr);
1311                         vmem_freelist_delete(vmp, oldseg);
1312                         oldseg->vs_start = addr;
1313                         vsp = oldseg;
1314                 } else
1315                         vsp = vmem_seg_create(vmp, span, addr, endaddr);
1316         }
1317         vmem_freelist_insert(vmp, vsp);
1318         vmp->vm_kstat.vk_mem_total += (endaddr - addr);
1319         return (vsp);
1320 }
1321
1322 /*
1323  * Does some error checking, calls vmem_extend_unlocked to add
1324  * [vaddr, vaddr+size) to vmp, then allocates alloc bytes from the
1325  * newly merged segment.
1326  */
1327 void *
1328 _vmem_extend_alloc(vmem_t *vmp, void *vaddr, size_t size, size_t alloc,
1329     int vmflag)
1330 {
1331         uintptr_t addr = (uintptr_t)vaddr;
1332         uintptr_t endaddr = addr + size;
1333         vmem_seg_t *vsp;
1334
1335         ASSERT(vaddr != NULL && size != 0 && endaddr > addr);
1336         ASSERT(alloc <= size && alloc != 0);
1337         ASSERT(((addr | size | alloc) & (vmp->vm_quantum - 1)) == 0);
1338
1339         ASSERT(!vmem_contains(vmp, vaddr, size));
1340
1341         (void) mutex_lock(&vmp->vm_lock);
1342         if (!vmem_populate(vmp, vmflag)) {
1343                 (void) mutex_unlock(&vmp->vm_lock);
1344                 return (NULL);
1345         }
1346         /*
1347          * if there is a source, we can't mess with the spans
1348          */
1349         if (vmp->vm_source_alloc != NULL)
1350                 vsp = vmem_span_create(vmp, vaddr, size, 0);
1351         else
1352                 vsp = vmem_extend_unlocked(vmp, addr, endaddr);
1353
1354         ASSERT(VS_SIZE(vsp) >= alloc);
1355
1356         addr = vsp->vs_start;
1357         (void) vmem_seg_alloc(vmp, vsp, addr, alloc);
1358         vaddr = (void *)addr;
1359
1360         (void) cond_broadcast(&vmp->vm_cv);
1361         (void) mutex_unlock(&vmp->vm_lock);
1362
1363         return (vaddr);
1364 }
1365
1366 /*
1367  * Walk the vmp arena, applying func to each segment matching typemask.
1368  * If VMEM_REENTRANT is specified, the arena lock is dropped across each
1369  * call to func(); otherwise, it is held for the duration of vmem_walk()
1370  * to ensure a consistent snapshot.  Note that VMEM_REENTRANT callbacks
1371  * are *not* necessarily consistent, so they may only be used when a hint
1372  * is adequate.
1373  */
1374 void
1375 vmem_walk(vmem_t *vmp, int typemask,
1376         void (*func)(void *, void *, size_t), void *arg)
1377 {
1378         vmem_seg_t *vsp;
1379         vmem_seg_t *seg0 = &vmp->vm_seg0;
1380         vmem_seg_t walker;
1381
1382         if (typemask & VMEM_WALKER)
1383                 return;
1384
1385         bzero(&walker, sizeof (walker));
1386         walker.vs_type = VMEM_WALKER;
1387
1388         (void) mutex_lock(&vmp->vm_lock);
1389         VMEM_INSERT(seg0, &walker, a);
1390         for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) {
1391                 if (vsp->vs_type & typemask) {
1392                         void *start = (void *)vsp->vs_start;
1393                         size_t size = VS_SIZE(vsp);
1394                         if (typemask & VMEM_REENTRANT) {
1395                                 vmem_advance(vmp, &walker, vsp);
1396                                 (void) mutex_unlock(&vmp->vm_lock);
1397                                 func(arg, start, size);
1398                                 (void) mutex_lock(&vmp->vm_lock);
1399                                 vsp = &walker;
1400                         } else {
1401                                 func(arg, start, size);
1402                         }
1403                 }
1404         }
1405         vmem_advance(vmp, &walker, NULL);
1406         (void) mutex_unlock(&vmp->vm_lock);
1407 }
1408
1409 /*
1410  * Return the total amount of memory whose type matches typemask.  Thus:
1411  *
1412  *      typemask VMEM_ALLOC yields total memory allocated (in use).
1413  *      typemask VMEM_FREE yields total memory free (available).
1414  *      typemask (VMEM_ALLOC | VMEM_FREE) yields total arena size.
1415  */
1416 size_t
1417 vmem_size(vmem_t *vmp, int typemask)
1418 {
1419         uint64_t size = 0;
1420
1421         if (typemask & VMEM_ALLOC)
1422                 size += vmp->vm_kstat.vk_mem_inuse;
1423         if (typemask & VMEM_FREE)
1424                 size += vmp->vm_kstat.vk_mem_total -
1425                     vmp->vm_kstat.vk_mem_inuse;
1426         return ((size_t)size);
1427 }
1428
1429 /*
1430  * Create an arena called name whose initial span is [base, base + size).
1431  * The arena's natural unit of currency is quantum, so vmem_alloc()
1432  * guarantees quantum-aligned results.  The arena may import new spans
1433  * by invoking afunc() on source, and may return those spans by invoking
1434  * ffunc() on source.  To make small allocations fast and scalable,
1435  * the arena offers high-performance caching for each integer multiple
1436  * of quantum up to qcache_max.
1437  */
1438 vmem_t *
1439 vmem_create(const char *name, void *base, size_t size, size_t quantum,
1440         vmem_alloc_t *afunc, vmem_free_t *ffunc, vmem_t *source,
1441         size_t qcache_max, int vmflag)
1442 {
1443         int i;
1444         size_t nqcache;
1445         vmem_t *vmp, *cur, **vmpp;
1446         vmem_seg_t *vsp;
1447         vmem_freelist_t *vfp;
1448         uint32_t id = atomic_add_32_nv(&vmem_id, 1);
1449
1450         if (vmem_vmem_arena != NULL) {
1451                 vmp = vmem_alloc(vmem_vmem_arena, sizeof (vmem_t),
1452                     vmflag & VM_UMFLAGS);
1453         } else {
1454                 ASSERT(id <= VMEM_INITIAL);
1455                 vmp = &vmem0[id - 1];
1456         }
1457
1458         if (vmp == NULL)
1459                 return (NULL);
1460         bzero(vmp, sizeof (vmem_t));
1461
1462         (void) snprintf(vmp->vm_name, VMEM_NAMELEN, "%s", name);
1463         (void) mutex_init(&vmp->vm_lock, USYNC_THREAD, NULL);
1464         (void) cond_init(&vmp->vm_cv, USYNC_THREAD, NULL);
1465         vmp->vm_cflags = vmflag;
1466         vmflag &= VM_UMFLAGS;
1467
1468         vmp->vm_quantum = quantum;
1469         vmp->vm_qshift = highbit(quantum) - 1;
1470         nqcache = MIN(qcache_max >> vmp->vm_qshift, VMEM_NQCACHE_MAX);
1471
1472         for (i = 0; i <= VMEM_FREELISTS; i++) {
1473                 vfp = &vmp->vm_freelist[i];
1474                 vfp->vs_end = 1UL << i;
1475                 vfp->vs_knext = (vmem_seg_t *)(vfp + 1);
1476                 vfp->vs_kprev = (vmem_seg_t *)(vfp - 1);
1477         }
1478
1479         vmp->vm_freelist[0].vs_kprev = NULL;
1480         vmp->vm_freelist[VMEM_FREELISTS].vs_knext = NULL;
1481         vmp->vm_freelist[VMEM_FREELISTS].vs_end = 0;
1482         vmp->vm_hash_table = vmp->vm_hash0;
1483         vmp->vm_hash_mask = VMEM_HASH_INITIAL - 1;
1484         vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);
1485
1486         vsp = &vmp->vm_seg0;
1487         vsp->vs_anext = vsp;
1488         vsp->vs_aprev = vsp;
1489         vsp->vs_knext = vsp;
1490         vsp->vs_kprev = vsp;
1491         vsp->vs_type = VMEM_SPAN;
1492
1493         vsp = &vmp->vm_rotor;
1494         vsp->vs_type = VMEM_ROTOR;
1495         VMEM_INSERT(&vmp->vm_seg0, vsp, a);
1496
1497         vmp->vm_id = id;
1498         if (source != NULL)
1499                 vmp->vm_kstat.vk_source_id = source->vm_id;
1500         vmp->vm_source = source;
1501         vmp->vm_source_alloc = afunc;
1502         vmp->vm_source_free = ffunc;
1503
1504         if (nqcache != 0) {
1505                 vmp->vm_qcache_max = nqcache << vmp->vm_qshift;
1506                 for (i = 0; i < nqcache; i++) {
1507                         char buf[VMEM_NAMELEN + 21];
1508                         (void) snprintf(buf, sizeof (buf), "%s_%lu",
1509                             vmp->vm_name, (long)((i + 1) * quantum));
1510                         vmp->vm_qcache[i] = umem_cache_create(buf,
1511                             (i + 1) * quantum, quantum, NULL, NULL, NULL,
1512                             NULL, vmp, UMC_QCACHE | UMC_NOTOUCH);
1513                         if (vmp->vm_qcache[i] == NULL) {
1514                                 vmp->vm_qcache_max = i * quantum;
1515                                 break;
1516                         }
1517                 }
1518         }
1519
1520         (void) mutex_lock(&vmem_list_lock);
1521         vmpp = &vmem_list;
1522         while ((cur = *vmpp) != NULL)
1523                 vmpp = &cur->vm_next;
1524         *vmpp = vmp;
1525         (void) mutex_unlock(&vmem_list_lock);
1526
1527         if (vmp->vm_cflags & VMC_POPULATOR) {
1528                 uint_t pop_id = atomic_add_32_nv(&vmem_populators, 1);
1529                 ASSERT(pop_id <= VMEM_INITIAL);
1530                 vmem_populator[pop_id - 1] = vmp;
1531                 (void) mutex_lock(&vmp->vm_lock);
1532                 (void) vmem_populate(vmp, vmflag | VM_PANIC);
1533                 (void) mutex_unlock(&vmp->vm_lock);
1534         }
1535
1536         if ((base || size) && vmem_add(vmp, base, size, vmflag) == NULL) {
1537                 vmem_destroy(vmp);
1538                 return (NULL);
1539         }
1540
1541         return (vmp);
1542 }
1543
1544 /*
1545  * Destroy arena vmp.
1546  */
1547 void
1548 vmem_destroy(vmem_t *vmp)
1549 {
1550         vmem_t *cur, **vmpp;
1551         vmem_seg_t *seg0 = &vmp->vm_seg0;
1552         vmem_seg_t *vsp;
1553         size_t leaked;
1554         int i;
1555
1556         (void) mutex_lock(&vmem_list_lock);
1557         vmpp = &vmem_list;
1558         while ((cur = *vmpp) != vmp)
1559                 vmpp = &cur->vm_next;
1560         *vmpp = vmp->vm_next;
1561         (void) mutex_unlock(&vmem_list_lock);
1562
1563         for (i = 0; i < VMEM_NQCACHE_MAX; i++)
1564                 if (vmp->vm_qcache[i])
1565                         umem_cache_destroy(vmp->vm_qcache[i]);
1566
1567         leaked = vmem_size(vmp, VMEM_ALLOC);
1568         if (leaked != 0)
1569                 umem_printf("vmem_destroy('%s'): leaked %lu bytes",
1570                     vmp->vm_name, leaked);
1571
1572         if (vmp->vm_hash_table != vmp->vm_hash0)
1573                 vmem_free(vmem_hash_arena, vmp->vm_hash_table,
1574                     (vmp->vm_hash_mask + 1) * sizeof (void *));
1575
1576         /*
1577          * Give back the segment structures for anything that's left in the
1578          * arena, e.g. the primary spans and their free segments.
1579          */
1580         VMEM_DELETE(&vmp->vm_rotor, a);
1581         for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext)
1582                 vmem_putseg_global(vsp);
1583
1584         while (vmp->vm_nsegfree > 0)
1585                 vmem_putseg_global(vmem_getseg(vmp));
1586
1587         (void) mutex_destroy(&vmp->vm_lock);
1588         (void) cond_destroy(&vmp->vm_cv);
1589         vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t));
1590 }
1591
1592 /*
1593  * Resize vmp's hash table to keep the average lookup depth near 1.0.
1594  */
1595 static void
1596 vmem_hash_rescale(vmem_t *vmp)
1597 {
1598         vmem_seg_t **old_table, **new_table, *vsp;
1599         size_t old_size, new_size, h, nseg;
1600
1601         nseg = (size_t)(vmp->vm_kstat.vk_alloc - vmp->vm_kstat.vk_free);
1602
1603         new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2));
1604         old_size = vmp->vm_hash_mask + 1;
1605
1606         if ((old_size >> 1) <= new_size && new_size <= (old_size << 1))
1607                 return;
1608
1609         new_table = vmem_alloc(vmem_hash_arena, new_size * sizeof (void *),
1610             VM_NOSLEEP);
1611         if (new_table == NULL)
1612                 return;
1613         bzero(new_table, new_size * sizeof (void *));
1614
1615         (void) mutex_lock(&vmp->vm_lock);
1616
1617         old_size = vmp->vm_hash_mask + 1;
1618         old_table = vmp->vm_hash_table;
1619
1620         vmp->vm_hash_mask = new_size - 1;
1621         vmp->vm_hash_table = new_table;
1622         vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);
1623
1624         for (h = 0; h < old_size; h++) {
1625                 vsp = old_table[h];
1626                 while (vsp != NULL) {
1627                         uintptr_t addr = vsp->vs_start;
1628                         vmem_seg_t *next_vsp = vsp->vs_knext;
1629                         vmem_seg_t **hash_bucket = VMEM_HASH(vmp, addr);
1630                         vsp->vs_knext = *hash_bucket;
1631                         *hash_bucket = vsp;
1632                         vsp = next_vsp;
1633                 }
1634         }
1635
1636         (void) mutex_unlock(&vmp->vm_lock);
1637
1638         if (old_table != vmp->vm_hash0)
1639                 vmem_free(vmem_hash_arena, old_table,
1640                     old_size * sizeof (void *));
1641 }
1642
1643 /*
1644  * Perform periodic maintenance on all vmem arenas.
1645  */
1646 /*ARGSUSED*/
1647 void
1648 vmem_update(void *dummy)
1649 {
1650         vmem_t *vmp;
1651
1652         (void) mutex_lock(&vmem_list_lock);
1653         for (vmp = vmem_list; vmp != NULL; vmp = vmp->vm_next) {
1654                 /*
1655                  * If threads are waiting for resources, wake them up
1656                  * periodically so they can issue another vmem_reap()
1657                  * to reclaim resources cached by the slab allocator.
1658                  */
1659                 (void) cond_broadcast(&vmp->vm_cv);
1660
1661                 /*
1662                  * Rescale the hash table to keep the hash chains short.
1663                  */
1664                 vmem_hash_rescale(vmp);
1665         }
1666         (void) mutex_unlock(&vmem_list_lock);
1667 }
1668
1669 /*
1670  * If vmem_init is called again, we need to be able to reset the world.
1671  * That includes resetting the statics back to their original values.
1672  */
1673 void
1674 vmem_startup(void)
1675 {
1676 #ifdef UMEM_STANDALONE
1677         vmem_id = 0;
1678         vmem_populators = 0;
1679         vmem_segfree = NULL;
1680         vmem_list = NULL;
1681         vmem_internal_arena = NULL;
1682         vmem_seg_arena = NULL;
1683         vmem_hash_arena = NULL;
1684         vmem_vmem_arena = NULL;
1685         vmem_heap = NULL;
1686         vmem_heap_alloc = NULL;
1687         vmem_heap_free = NULL;
1688
1689         bzero(vmem0, sizeof (vmem0));
1690         bzero(vmem_populator, sizeof (vmem_populator));
1691         bzero(vmem_seg0, sizeof (vmem_seg0));
1692 #endif
1693 }
1694
1695 /*
1696  * Prepare vmem for use.
1697  */
1698 vmem_t *
1699 vmem_init(const char *parent_name, size_t parent_quantum,
1700     vmem_alloc_t *parent_alloc, vmem_free_t *parent_free,
1701     const char *heap_name, void *heap_start, size_t heap_size,
1702     size_t heap_quantum, vmem_alloc_t *heap_alloc, vmem_free_t *heap_free)
1703 {
1704         uint32_t id;
1705         int nseg = VMEM_SEG_INITIAL;
1706         vmem_t *parent, *heap;
1707
1708         ASSERT(vmem_internal_arena == NULL);
1709
1710         while (--nseg >= 0)
1711                 vmem_putseg_global(&vmem_seg0[nseg]);
1712
1713         if (parent_name != NULL) {
1714                 parent = vmem_create(parent_name,
1715                     heap_start, heap_size, parent_quantum,
1716                     NULL, NULL, NULL, 0,
1717                     VM_SLEEP | VMC_POPULATOR);
1718                 heap_start = NULL;
1719                 heap_size = 0;
1720         } else {
1721                 ASSERT(parent_alloc == NULL && parent_free == NULL);
1722                 parent = NULL;
1723         }
1724
1725         heap = vmem_create(heap_name,
1726             heap_start, heap_size, heap_quantum,
1727             parent_alloc, parent_free, parent, 0,
1728             VM_SLEEP | VMC_POPULATOR);
1729
1730         vmem_heap = heap;
1731         vmem_heap_alloc = heap_alloc;
1732         vmem_heap_free = heap_free;
1733
1734         vmem_internal_arena = vmem_create("vmem_internal",
1735             NULL, 0, heap_quantum,
1736             heap_alloc, heap_free, heap, 0,
1737             VM_SLEEP | VMC_POPULATOR);
1738
1739         vmem_seg_arena = vmem_create("vmem_seg",
1740             NULL, 0, heap_quantum,
1741             vmem_alloc, vmem_free, vmem_internal_arena, 0,
1742             VM_SLEEP | VMC_POPULATOR);
1743
1744         vmem_hash_arena = vmem_create("vmem_hash",
1745             NULL, 0, 8,
1746             vmem_alloc, vmem_free, vmem_internal_arena, 0,
1747             VM_SLEEP);
1748
1749         vmem_vmem_arena = vmem_create("vmem_vmem",
1750             vmem0, sizeof (vmem0), 1,
1751             vmem_alloc, vmem_free, vmem_internal_arena, 0,
1752             VM_SLEEP);
1753
1754         for (id = 0; id < vmem_id; id++)
1755                 (void) vmem_xalloc(vmem_vmem_arena, sizeof (vmem_t),
1756                     1, 0, 0, &vmem0[id], &vmem0[id + 1],
1757                     VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
1758
1759         return (heap);
1760 }
1761
1762 void
1763 vmem_no_debug(void)
1764 {
1765         /*
1766          * This size must be a multiple of the minimum required alignment,
1767          * since vmem_populate allocates them compactly.
1768          */
1769         vmem_seg_size = P2ROUNDUP(offsetof(vmem_seg_t, vs_thread),
1770             sizeof (hrtime_t));
1771 }
1772
1773 /*
1774  * Lockup and release, for fork1(2) handling.
1775  */
1776 void
1777 vmem_lockup(void)
1778 {
1779         vmem_t *cur;
1780
1781         (void) mutex_lock(&vmem_list_lock);
1782         (void) mutex_lock(&vmem_nosleep_lock.vmpl_mutex);
1783
1784         /*
1785          * Lock up and broadcast all arenas.
1786          */
1787         for (cur = vmem_list; cur != NULL; cur = cur->vm_next) {
1788                 (void) mutex_lock(&cur->vm_lock);
1789                 (void) cond_broadcast(&cur->vm_cv);
1790         }
1791
1792         (void) mutex_lock(&vmem_segfree_lock);
1793 }
1794
1795 void
1796 vmem_release(void)
1797 {
1798         vmem_t *cur;
1799
1800         (void) mutex_unlock(&vmem_nosleep_lock.vmpl_mutex);
1801
1802         for (cur = vmem_list; cur != NULL; cur = cur->vm_next)
1803                 (void) mutex_unlock(&cur->vm_lock);
1804
1805         (void) mutex_unlock(&vmem_segfree_lock);
1806         (void) mutex_unlock(&vmem_list_lock);
1807 }