zfs/lib/libumem/umem.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 /*
  27  * Portions Copyright 2006 OmniTI, Inc.
  28  */
  29
  30 /* #pragma ident        "@(#)umem.c     1.11    05/06/08 SMI" */
  31
  32 /*!
  33  * \mainpage Main Page
  34  *
  35  * \section README
  36  *
  37  * \include README
  38  *
  39  * \section Nuances
  40  *
  41  * There is a nuance in the behaviour of the umem port compared
  42  * with umem on Solaris.
  43  *
  44  * On Linux umem will not return memory back to the OS until umem fails
  45  * to allocate a chunk. On failure, umem_reap() will be called automatically,
  46  * to return memory to the OS. If your code is going to be running
  47  * for a long time on Linux and mixes calls to different memory allocators
  48  * (e.g.: malloc()) and umem, your code will need to call
  49  * umem_reap() periodically.
  50  *
  51  * This doesn't happen on Solaris, because malloc is replaced
  52  * with umem calls, meaning that umem_reap() is called automatically.
  53  *
  54  * \section References
  55  *
  56  * http://docs.sun.com/app/docs/doc/816-5173/6mbb8advq?a=view
  57  *
  58  * http://access1.sun.com/techarticles/libumem.html
  59  *
  60  * \section Overview
  61  *
  62  * \code
  63  * based on usr/src/uts/common/os/kmem.c r1.64 from 2001/12/18
  64  *
  65  * The slab allocator, as described in the following two papers:
  66  *
  67  *      Jeff Bonwick,
  68  *      The Slab Allocator: An Object-Caching Kernel Memory Allocator.
  69  *      Proceedings of the Summer 1994 Usenix Conference.
  70  *      Available as /shared/sac/PSARC/1994/028/materials/kmem.pdf.
  71  *
  72  *      Jeff Bonwick and Jonathan Adams,
  73  *      Magazines and vmem: Extending the Slab Allocator to Many CPUs and
  74  *      Arbitrary Resources.
  75  *      Proceedings of the 2001 Usenix Conference.
  76  *      Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
  77  *
  78  * 1. Overview
  79  * -----------
  80  * umem is very close to kmem in implementation.  There are four major
  81  * areas of divergence:
  82  *
  83  *      * Initialization
  84  *
  85  *      * CPU handling
  86  *
  87  *      * umem_update()
  88  *
  89  *      * KM_SLEEP v.s. UMEM_NOFAIL
  90  *
  91  *
  92  * 2. Initialization
  93  * -----------------
  94  * kmem is initialized early on in boot, and knows that no one will call
  95  * into it before it is ready.  umem does not have these luxuries. Instead,
  96  * initialization is divided into two phases:
  97  *
  98  *      * library initialization, and
  99  *
 100  *      * first use
 101  *
 102  * umem's full initialization happens at the time of the first allocation
 103  * request (via malloc() and friends, umem_alloc(), or umem_zalloc()),
 104  * or the first call to umem_cache_create().
 105  *
 106  * umem_free(), and umem_cache_alloc() do not require special handling,
 107  * since the only way to get valid arguments for them is to successfully
 108  * call a function from the first group.
 109  *
 110  * 2.1. Library Initialization: umem_startup()
 111  * -------------------------------------------
 112  * umem_startup() is libumem.so's .init section.  It calls pthread_atfork()
 113  * to install the handlers necessary for umem's Fork1-Safety.  Because of
 114  * race condition issues, all other pre-umem_init() initialization is done
 115  * statically (i.e. by the dynamic linker).
 116  *
 117  * For standalone use, umem_startup() returns everything to its initial
 118  * state.
 119  *
 120  * 2.2. First use: umem_init()
 121  * ------------------------------
 122  * The first time any memory allocation function is used, we have to
 123  * create the backing caches and vmem arenas which are needed for it.
 124  * umem_init() is the central point for that task.  When it completes,
 125  * umem_ready is either UMEM_READY (all set) or UMEM_READY_INIT_FAILED (unable
 126  * to initialize, probably due to lack of memory).
 127  *
 128  * There are four different paths from which umem_init() is called:
 129  *
 130  *      * from umem_alloc() or umem_zalloc(), with 0 < size < UMEM_MAXBUF,
 131  *
 132  *      * from umem_alloc() or umem_zalloc(), with size > UMEM_MAXBUF,
 133  *
 134  *      * from umem_cache_create(), and
 135  *
 136  *      * from memalign(), with align > UMEM_ALIGN.
 137  *
 138  * The last three just check if umem is initialized, and call umem_init()
 139  * if it is not.  For performance reasons, the first case is more complicated.
 140  *
 141  * 2.2.1. umem_alloc()/umem_zalloc(), with 0 < size < UMEM_MAXBUF
 142  * -----------------------------------------------------------------
 143  * In this case, umem_cache_alloc(&umem_null_cache, ...) is called.
 144  * There is special case code in which causes any allocation on
 145  * &umem_null_cache to fail by returning (NULL), regardless of the
 146  * flags argument.
 147  *
 148  * So umem_cache_alloc() returns NULL, and umem_alloc()/umem_zalloc() call
 149  * umem_alloc_retry().  umem_alloc_retry() sees that the allocation
 150  * was agains &umem_null_cache, and calls umem_init().
 151  *
 152  * If initialization is successful, umem_alloc_retry() returns 1, which
 153  * causes umem_alloc()/umem_zalloc() to start over, which causes it to load
 154  * the (now valid) cache pointer from umem_alloc_table.
 155  *
 156  * 2.2.2. Dealing with race conditions
 157  * -----------------------------------
 158  * There are a couple race conditions resulting from the initialization
 159  * code that we have to guard against:
 160  *
 161  *      * In umem_cache_create(), there is a special UMC_INTERNAL cflag
 162  *      that is passed for caches created during initialization.  It
 163  *      is illegal for a user to try to create a UMC_INTERNAL cache.
 164  *      This allows initialization to proceed, but any other
 165  *      umem_cache_create()s will block by calling umem_init().
 166  *
 167  *      * Since umem_null_cache has a 1-element cache_cpu, it's cache_cpu_mask
 168  *      is always zero.  umem_cache_alloc uses cp->cache_cpu_mask to
 169  *      mask the cpu number.  This prevents a race between grabbing a
 170  *      cache pointer out of umem_alloc_table and growing the cpu array.
 171  *
 172  *
 173  * 3. CPU handling
 174  * ---------------
 175  * kmem uses the CPU's sequence number to determine which "cpu cache" to
 176  * use for an allocation.  Currently, there is no way to get the sequence
 177  * number in userspace.
 178  *
 179  * umem keeps track of cpu information in umem_cpus, an array of umem_max_ncpus
 180  * umem_cpu_t structures.  CURCPU() is a a "hint" function, which we then mask
 181  * with either umem_cpu_mask or cp->cache_cpu_mask to find the actual "cpu" id.
 182  * The mechanics of this is all in the CPU(mask) macro.
 183  *
 184  * Currently, umem uses _lwp_self() as its hint.
 185  *
 186  *
 187  * 4. The update thread
 188  * --------------------
 189  * kmem uses a task queue, kmem_taskq, to do periodic maintenance on
 190  * every kmem cache.  vmem has a periodic timeout for hash table resizing.
 191  * The kmem_taskq also provides a separate context for kmem_cache_reap()'s
 192  * to be done in, avoiding issues of the context of kmem_reap() callers.
 193  *
 194  * Instead, umem has the concept of "updates", which are asynchronous requests
 195  * for work attached to single caches.  All caches with pending work are
 196  * on a doubly linked list rooted at the umem_null_cache.  All update state
 197  * is protected by the umem_update_lock mutex, and the umem_update_cv is used
 198  * for notification between threads.
 199  *
 200  * 4.1. Cache states with regards to updates
 201  * -----------------------------------------
 202  * A given cache is in one of three states:
 203  *
 204  * Inactive             cache_uflags is zero, cache_u{next,prev} are NULL
 205  *
 206  * Work Requested       cache_uflags is non-zero (but UMU_ACTIVE is not set),
 207  *                      cache_u{next,prev} link the cache onto the global
 208  *                      update list
 209  *
 210  * Active               cache_uflags has UMU_ACTIVE set, cache_u{next,prev}
 211  *                      are NULL, and either umem_update_thr or
 212  *                      umem_st_update_thr are actively doing work on the
 213  *                      cache.
 214  *
 215  * An update can be added to any cache in any state -- if the cache is
 216  * Inactive, it transitions to being Work Requested.  If the cache is
 217  * Active, the worker will notice the new update and act on it before
 218  * transitioning the cache to the Inactive state.
 219  *
 220  * If a cache is in the Active state, UMU_NOTIFY can be set, which asks
 221  * the worker to broadcast the umem_update_cv when it has finished.
 222  *
 223  * 4.2. Update interface
 224  * ---------------------
 225  * umem_add_update() adds an update to a particular cache.
 226  * umem_updateall() adds an update to all caches.
 227  * umem_remove_updates() returns a cache to the Inactive state.
 228  *
 229  * umem_process_updates() process all caches in the Work Requested state.
 230  *
 231  * 4.3. Reaping
 232  * ------------
 233  * When umem_reap() is called (at the time of heap growth), it schedule
 234  * UMU_REAP updates on every cache.  It then checks to see if the update
 235  * thread exists (umem_update_thr != 0).  If it is, it broadcasts
 236  * the umem_update_cv to wake the update thread up, and returns.
 237  *
 238  * If the update thread does not exist (umem_update_thr == 0), and the
 239  * program currently has multiple threads, umem_reap() attempts to create
 240  * a new update thread.
 241  *
 242  * If the process is not multithreaded, or the creation fails, umem_reap()
 243  * calls umem_st_update() to do an inline update.
 244  *
 245  * 4.4. The update thread
 246  * ----------------------
 247  * The update thread spends most of its time in cond_timedwait() on the
 248  * umem_update_cv.  It wakes up under two conditions:
 249  *
 250  *      * The timedwait times out, in which case it needs to run a global
 251  *      update, or
 252  *
 253  *      * someone cond_broadcast(3THR)s the umem_update_cv, in which case
 254  *      it needs to check if there are any caches in the Work Requested
 255  *      state.
 256  *
 257  * When it is time for another global update, umem calls umem_cache_update()
 258  * on every cache, then calls vmem_update(), which tunes the vmem structures.
 259  * umem_cache_update() can request further work using umem_add_update().
 260  *
 261  * After any work from the global update completes, the update timer is
 262  * reset to umem_reap_interval seconds in the future.  This makes the
 263  * updates self-throttling.
 264  *
 265  * Reaps are similarly self-throttling.  After a UMU_REAP update has
 266  * been scheduled on all caches, umem_reap() sets a flag and wakes up the
 267  * update thread.  The update thread notices the flag, and resets the
 268  * reap state.
 269  *
 270  * 4.5. Inline updates
 271  * -------------------
 272  * If the update thread is not running, umem_st_update() is used instead.  It
 273  * immediately does a global update (as above), then calls
 274  * umem_process_updates() to process both the reaps that umem_reap() added and
 275  * any work generated by the global update.  Afterwards, it resets the reap
 276  * state.
 277  *
 278  * While the umem_st_update() is running, umem_st_update_thr holds the thread
 279  * id of the thread performing the update.
 280  *
 281  * 4.6. Updates and fork1()
 282  * ------------------------
 283  * umem has fork1() pre- and post-handlers which lock up (and release) every
 284  * mutex in every cache.  They also lock up the umem_update_lock.  Since
 285  * fork1() only copies over a single lwp, other threads (including the update
 286  * thread) could have been actively using a cache in the parent.  This
 287  * can lead to inconsistencies in the child process.
 288  *
 289  * Because we locked all of the mutexes, the only possible inconsistancies are:
 290  *
 291  *      * a umem_cache_alloc() could leak its buffer.
 292  *
 293  *      * a caller of umem_depot_alloc() could leak a magazine, and all the
 294  *      buffers contained in it.
 295  *
 296  *      * a cache could be in the Active update state.  In the child, there
 297  *      would be no thread actually working on it.
 298  *
 299  *      * a umem_hash_rescale() could leak the new hash table.
 300  *
 301  *      * a umem_magazine_resize() could be in progress.
 302  *
 303  *      * a umem_reap() could be in progress.
 304  *
 305  * The memory leaks we can't do anything about.  umem_release_child() resets
 306  * the update state, moves any caches in the Active state to the Work Requested
 307  * state.  This might cause some updates to be re-run, but UMU_REAP and
 308  * UMU_HASH_RESCALE are effectively idempotent, and the worst that can
 309  * happen from umem_magazine_resize() is resizing the magazine twice in close
 310  * succession.
 311  *
 312  * Much of the cleanup in umem_release_child() is skipped if
 313  * umem_st_update_thr == thr_self().  This is so that applications which call
 314  * fork1() from a cache callback does not break.  Needless to say, any such
 315  * application is tremendously broken.
 316  *
 317  *
 318  * 5. KM_SLEEP v.s. UMEM_NOFAIL
 319  * ----------------------------
 320  * Allocations against kmem and vmem have two basic modes:  SLEEP and
 321  * NOSLEEP.  A sleeping allocation is will go to sleep (waiting for
 322  * more memory) instead of failing (returning NULL).
 323  *
 324  * SLEEP allocations presume an extremely multithreaded model, with
 325  * a lot of allocation and deallocation activity.  umem cannot presume
 326  * that its clients have any particular type of behavior.  Instead,
 327  * it provides two types of allocations:
 328  *
 329  *      * UMEM_DEFAULT, equivalent to KM_NOSLEEP (i.e. return NULL on
 330  *      failure)
 331  *
 332  *      * UMEM_NOFAIL, which, on failure, calls an optional callback
 333  *      (registered with umem_nofail_callback()).
 334  *
 335  * The callback is invoked with no locks held, and can do an arbitrary
 336  * amount of work.  It then has a choice between:
 337  *
 338  *      * Returning UMEM_CALLBACK_RETRY, which will cause the allocation
 339  *      to be restarted.
 340  *
 341  *      * Returning UMEM_CALLBACK_EXIT(status), which will cause exit(2)
 342  *      to be invoked with status.  If multiple threads attempt to do
 343  *      this simultaneously, only one will call exit(2).
 344  *
 345  *      * Doing some kind of non-local exit (thr_exit(3thr), longjmp(3C),
 346  *      etc.)
 347  *
 348  * The default callback returns UMEM_CALLBACK_EXIT(255).
 349  *
 350  * To have these callbacks without risk of state corruption (in the case of
 351  * a non-local exit), we have to ensure that the callbacks get invoked
 352  * close to the original allocation, with no inconsistent state or held
 353  * locks.  The following steps are taken:
 354  *
 355  *      * All invocations of vmem are VM_NOSLEEP.
 356  *
 357  *      * All constructor callbacks (which can themselves to allocations)
 358  *      are passed UMEM_DEFAULT as their required allocation argument.  This
 359  *      way, the constructor will fail, allowing the highest-level allocation
 360  *      invoke the nofail callback.
 361  *
 362  *      If a constructor callback _does_ do a UMEM_NOFAIL allocation, and
 363  *      the nofail callback does a non-local exit, we will leak the
 364  *      partially-constructed buffer.
 365  * \endcode
 366  */
 367
 368 #include "config.h"
 369 /* #include "mtlib.h" */
 370 #include <umem_impl.h>
 371 #include <sys/vmem_impl_user.h>
 372 #include "umem_base.h"
 373 #include "vmem_base.h"
 374
 375 #if HAVE_SYS_PROCESSOR_H
 376 #include <sys/processor.h>
 377 #endif
 378 #if HAVE_SYS_SYSMACROS_H
 379 #include <sys/sysmacros.h>
 380 #endif
 381
 382 #if HAVE_ALLOCA_H
 383 #include <alloca.h>
 384 #endif
 385 #include <errno.h>
 386 #include <limits.h>
 387 #include <stdio.h>
 388 #include <stdlib.h>
 389 #include <string.h>
 390 #if HAVE_STRINGS_H
 391 #include <strings.h>
 392 #endif
 393 #include <signal.h>
 394 #if HAVE_UNISTD_H
 395 #include <unistd.h>
 396 #endif
 397 #if HAVE_ATOMIC_H
 398 #include <atomic.h>
 399 #endif
 400
 401 #include "misc.h"
 402
 403 #define UMEM_VMFLAGS(umflag)    (VM_NOSLEEP)
 404
 405 size_t pagesize;
 406
 407 /*
 408  * The default set of caches to back umem_alloc().
 409  * These sizes should be reevaluated periodically.
 410  *
 411  * We want allocations that are multiples of the coherency granularity
 412  * (64 bytes) to be satisfied from a cache which is a multiple of 64
 413  * bytes, so that it will be 64-byte aligned.  For all multiples of 64,
 414  * the next kmem_cache_size greater than or equal to it must be a
 415  * multiple of 64.
 416  */
 417 static const int umem_alloc_sizes[] = {
 418 #ifdef _LP64
 419         1 * 8,
 420         1 * 16,
 421         2 * 16,
 422         3 * 16,
 423 #else
 424         1 * 8,
 425         2 * 8,
 426         3 * 8,
 427         4 * 8,          5 * 8,          6 * 8,          7 * 8,
 428 #endif
 429         4 * 16,         5 * 16,         6 * 16,         7 * 16,
 430         4 * 32,         5 * 32,         6 * 32,         7 * 32,
 431         4 * 64,         5 * 64,         6 * 64,         7 * 64,
 432         4 * 128,        5 * 128,        6 * 128,        7 * 128,
 433         P2ALIGN(8192 / 7, 64),
 434         P2ALIGN(8192 / 6, 64),
 435         P2ALIGN(8192 / 5, 64),
 436         P2ALIGN(8192 / 4, 64),
 437         P2ALIGN(8192 / 3, 64),
 438         P2ALIGN(8192 / 2, 64),
 439         P2ALIGN(8192 / 1, 64),
 440         4096 * 3,
 441         8192 * 2,
 442 };
 443 #define NUM_ALLOC_SIZES (sizeof (umem_alloc_sizes) / sizeof (*umem_alloc_sizes))
 444
 445 #define UMEM_MAXBUF     16384
 446
 447 static umem_magtype_t umem_magtype[] = {
 448         { 1,    8,      3200,   65536   },
 449         { 3,    16,     256,    32768   },
 450         { 7,    32,     64,     16384   },
 451         { 15,   64,     0,      8192    },
 452         { 31,   64,     0,      4096    },
 453         { 47,   64,     0,      2048    },
 454         { 63,   64,     0,      1024    },
 455         { 95,   64,     0,      512     },
 456         { 143,  64,     0,      0       },
 457 };
 458
 459 /*
 460  * umem tunables
 461  */
 462 uint32_t umem_max_ncpus;        /* # of CPU caches. */
 463
 464 uint32_t umem_stack_depth = 15; /* # stack frames in a bufctl_audit */
 465 uint32_t umem_reap_interval = 10; /* max reaping rate (seconds) */
 466 uint_t umem_depot_contention = 2; /* max failed trylocks per real interval */
 467 uint_t umem_abort = 1;          /* whether to abort on error */
 468 uint_t umem_output = 0;         /* whether to write to standard error */
 469 uint_t umem_logging = 0;        /* umem_log_enter() override */
 470 uint32_t umem_mtbf = 0;         /* mean time between failures [default: off] */
 471 size_t umem_transaction_log_size; /* size of transaction log */
 472 size_t umem_content_log_size;   /* size of content log */
 473 size_t umem_failure_log_size;   /* failure log [4 pages per CPU] */
 474 size_t umem_slab_log_size;      /* slab create log [4 pages per CPU] */
 475 size_t umem_content_maxsave = 256; /* UMF_CONTENTS max bytes to log */
 476 size_t umem_lite_minsize = 0;   /* minimum buffer size for UMF_LITE */
 477 size_t umem_lite_maxalign = 1024; /* maximum buffer alignment for UMF_LITE */
 478 size_t umem_maxverify;          /* maximum bytes to inspect in debug routines */
 479 size_t umem_minfirewall;        /* hardware-enforced redzone threshold */
 480
 481 uint_t umem_flags = 0;
 482
 483 mutex_t                 umem_init_lock = DEFAULTMUTEX;          /* locks initialization */
 484 cond_t                  umem_init_cv = DEFAULTCV;               /* initialization CV */
 485 thread_t                umem_init_thr;          /* thread initializing */
 486 int                     umem_init_env_ready;    /* environ pre-initted */
 487 int                     umem_ready = UMEM_READY_STARTUP;
 488
 489 static umem_nofail_callback_t *nofail_callback;
 490 static mutex_t          umem_nofail_exit_lock = DEFAULTMUTEX;
 491 static thread_t         umem_nofail_exit_thr;
 492
 493 static umem_cache_t     *umem_slab_cache;
 494 static umem_cache_t     *umem_bufctl_cache;
 495 static umem_cache_t     *umem_bufctl_audit_cache;
 496
 497 mutex_t                 umem_flags_lock = DEFAULTMUTEX;
 498
 499 static vmem_t           *heap_arena;
 500 static vmem_alloc_t     *heap_alloc;
 501 static vmem_free_t      *heap_free;
 502
 503 static vmem_t           *umem_internal_arena;
 504 static vmem_t           *umem_cache_arena;
 505 static vmem_t           *umem_hash_arena;
 506 static vmem_t           *umem_log_arena;
 507 static vmem_t           *umem_oversize_arena;
 508 static vmem_t           *umem_va_arena;
 509 static vmem_t           *umem_default_arena;
 510 static vmem_t           *umem_firewall_va_arena;
 511 static vmem_t           *umem_firewall_arena;
 512
 513 vmem_t                  *umem_memalign_arena;
 514
 515 umem_log_header_t *umem_transaction_log;
 516 umem_log_header_t *umem_content_log;
 517 umem_log_header_t *umem_failure_log;
 518 umem_log_header_t *umem_slab_log;
 519
 520 extern thread_t _thr_self(void);
 521 #if defined(__MACH__) || defined(__FreeBSD__)
 522 # define CPUHINT()      ((int)(_thr_self()))
 523 #endif
 524
 525 #ifndef CPUHINT
 526 #define CPUHINT()               (_thr_self())
 527 #endif
 528
 529 #define CPUHINT_MAX()           INT_MAX
 530
 531 #define CPU(mask)               (umem_cpus + (CPUHINT() & (mask)))
 532 static umem_cpu_t umem_startup_cpu = {  /* initial, single, cpu */
 533         UMEM_CACHE_SIZE(0),
 534         0
 535 };
 536
 537 static uint32_t umem_cpu_mask = 0;                      /* global cpu mask */
 538 static umem_cpu_t *umem_cpus = &umem_startup_cpu;       /* cpu list */
 539
 540 volatile uint32_t umem_reaping;
 541
 542 thread_t                umem_update_thr;
 543 struct timeval          umem_update_next;       /* timeofday of next update */
 544 volatile thread_t       umem_st_update_thr;     /* only used when single-thd */
 545
 546 #define IN_UPDATE()     (thr_self() == umem_update_thr || \
 547                             thr_self() == umem_st_update_thr)
 548 #define IN_REAP()       IN_UPDATE()
 549
 550 mutex_t                 umem_update_lock = DEFAULTMUTEX;        /* cache_u{next,prev,flags} */
 551 cond_t                  umem_update_cv = DEFAULTCV;
 552
 553 volatile hrtime_t umem_reap_next;       /* min hrtime of next reap */
 554
 555 mutex_t                 umem_cache_lock = DEFAULTMUTEX; /* inter-cache linkage only */
 556
 557 #ifdef UMEM_STANDALONE
 558 umem_cache_t            umem_null_cache;
 559 static const umem_cache_t umem_null_cache_template = {
 560 #else
 561 umem_cache_t            umem_null_cache = {
 562 #endif
 563         0, 0, 0, 0, 0,
 564         0, 0,
 565         0, 0,
 566         0, 0,
 567         "invalid_cache",
 568         0, 0,
 569         NULL, NULL, NULL, NULL,
 570         NULL,
 571         0, 0, 0, 0,
 572         &umem_null_cache, &umem_null_cache,
 573         &umem_null_cache, &umem_null_cache,
 574         0,
 575         DEFAULTMUTEX,                           /* start of slab layer */
 576         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 577         &umem_null_cache.cache_nullslab,
 578         {
 579                 &umem_null_cache,
 580                 NULL,
 581                 &umem_null_cache.cache_nullslab,
 582                 &umem_null_cache.cache_nullslab,
 583                 NULL,
 584                 -1,
 585                 0
 586         },
 587         NULL,
 588         NULL,
 589         DEFAULTMUTEX,                           /* start of depot layer */
 590         NULL, {
 591                 NULL, 0, 0, 0, 0
 592         }, {
 593                 NULL, 0, 0, 0, 0
 594         }, {
 595                 {
 596                         DEFAULTMUTEX,           /* start of CPU cache */
 597                         0, 0, NULL, NULL, -1, -1, 0
 598                 }
 599         }
 600 };
 601
 602 #define ALLOC_TABLE_4 \
 603         &umem_null_cache, &umem_null_cache, &umem_null_cache, &umem_null_cache
 604
 605 #define ALLOC_TABLE_64 \
 606         ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, \
 607         ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, \
 608         ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, \
 609         ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4
 610
 611 #define ALLOC_TABLE_1024 \
 612         ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, \
 613         ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, \
 614         ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, \
 615         ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64
 616
 617 static umem_cache_t *umem_alloc_table[UMEM_MAXBUF >> UMEM_ALIGN_SHIFT] = {
 618         ALLOC_TABLE_1024,
 619         ALLOC_TABLE_1024
 620 };
 621
 622
 623 /* Used to constrain audit-log stack traces */
 624 caddr_t                 umem_min_stack;
 625 caddr_t                 umem_max_stack;
 626
 627
 628 /*
 629  * we use the _ versions, since we don't want to be cancelled.
 630  * Actually, this is automatically taken care of by including "mtlib.h".
 631  */
 632 extern int _cond_wait(cond_t *cv, mutex_t *mutex);
 633
 634 #define UMERR_MODIFIED  0       /* buffer modified while on freelist */
 635 #define UMERR_REDZONE   1       /* redzone violation (write past end of buf) */
 636 #define UMERR_DUPFREE   2       /* freed a buffer twice */
 637 #define UMERR_BADADDR   3       /* freed a bad (unallocated) address */
 638 #define UMERR_BADBUFTAG 4       /* buftag corrupted */
 639 #define UMERR_BADBUFCTL 5       /* bufctl corrupted */
 640 #define UMERR_BADCACHE  6       /* freed a buffer to the wrong cache */
 641 #define UMERR_BADSIZE   7       /* alloc size != free size */
 642 #define UMERR_BADBASE   8       /* buffer base address wrong */
 643
 644 struct {
 645         hrtime_t        ump_timestamp;  /* timestamp of error */
 646         int             ump_error;      /* type of umem error (UMERR_*) */
 647         void            *ump_buffer;    /* buffer that induced abort */
 648         void            *ump_realbuf;   /* real start address for buffer */
 649         umem_cache_t    *ump_cache;     /* buffer's cache according to client */
 650         umem_cache_t    *ump_realcache; /* actual cache containing buffer */
 651         umem_slab_t     *ump_slab;      /* slab accoring to umem_findslab() */
 652         umem_bufctl_t   *ump_bufctl;    /* bufctl */
 653 } umem_abort_info;
 654
 655 static void
 656 copy_pattern(uint64_t pattern, void *buf_arg, size_t size)
 657 {
 658         uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
 659         uint64_t *buf = buf_arg;
 660
 661         while (buf < bufend)
 662                 *buf++ = pattern;
 663 }
 664
 665 static void *
 666 verify_pattern(uint64_t pattern, void *buf_arg, size_t size)
 667 {
 668         uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
 669         uint64_t *buf;
 670
 671         for (buf = buf_arg; buf < bufend; buf++)
 672                 if (*buf != pattern)
 673                         return (buf);
 674         return (NULL);
 675 }
 676
 677 static void *
 678 verify_and_copy_pattern(uint64_t old, uint64_t new, void *buf_arg, size_t size)
 679 {
 680         uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
 681         uint64_t *buf;
 682
 683         for (buf = buf_arg; buf < bufend; buf++) {
 684                 if (*buf != old) {
 685                         copy_pattern(old, buf_arg,
 686                             (char *)buf - (char *)buf_arg);
 687                         return (buf);
 688                 }
 689                 *buf = new;
 690         }
 691
 692         return (NULL);
 693 }
 694
 695 void
 696 umem_cache_applyall(void (*func)(umem_cache_t *))
 697 {
 698         umem_cache_t *cp;
 699
 700         (void) mutex_lock(&umem_cache_lock);
 701         for (cp = umem_null_cache.cache_next; cp != &umem_null_cache;
 702             cp = cp->cache_next)
 703                 func(cp);
 704         (void) mutex_unlock(&umem_cache_lock);
 705 }
 706
 707 static void
 708 umem_add_update_unlocked(umem_cache_t *cp, int flags)
 709 {
 710         umem_cache_t *cnext, *cprev;
 711
 712         flags &= ~UMU_ACTIVE;
 713
 714         if (!flags)
 715                 return;
 716
 717         if (cp->cache_uflags & UMU_ACTIVE) {
 718                 cp->cache_uflags |= flags;
 719         } else {
 720                 if (cp->cache_unext != NULL) {
 721                         ASSERT(cp->cache_uflags != 0);
 722                         cp->cache_uflags |= flags;
 723                 } else {
 724                         ASSERT(cp->cache_uflags == 0);
 725                         cp->cache_uflags = flags;
 726                         cp->cache_unext = cnext = &umem_null_cache;
 727                         cp->cache_uprev = cprev = umem_null_cache.cache_uprev;
 728                         cnext->cache_uprev = cp;
 729                         cprev->cache_unext = cp;
 730                 }
 731         }
 732 }
 733
 734 static void
 735 umem_add_update(umem_cache_t *cp, int flags)
 736 {
 737         (void) mutex_lock(&umem_update_lock);
 738
 739         umem_add_update_unlocked(cp, flags);
 740
 741         if (!IN_UPDATE())
 742                 (void) cond_broadcast(&umem_update_cv);
 743
 744         (void) mutex_unlock(&umem_update_lock);
 745 }
 746
 747 /*
 748  * Remove a cache from the update list, waiting for any in-progress work to
 749  * complete first.
 750  */
 751 static void
 752 umem_remove_updates(umem_cache_t *cp)
 753 {
 754         (void) mutex_lock(&umem_update_lock);
 755
 756         /*
 757          * Get it out of the active state
 758          */
 759         while (cp->cache_uflags & UMU_ACTIVE) {
 760                 ASSERT(cp->cache_unext == NULL);
 761
 762                 cp->cache_uflags |= UMU_NOTIFY;
 763
 764                 /*
 765                  * Make sure the update state is sane, before we wait
 766                  */
 767                 ASSERT(umem_update_thr != 0 || umem_st_update_thr != 0);
 768                 ASSERT(umem_update_thr != thr_self() &&
 769                     umem_st_update_thr != thr_self());
 770
 771                 (void) _cond_wait(&umem_update_cv, &umem_update_lock);
 772         }
 773         /*
 774          * Get it out of the Work Requested state
 775          */
 776         if (cp->cache_unext != NULL) {
 777                 cp->cache_uprev->cache_unext = cp->cache_unext;
 778                 cp->cache_unext->cache_uprev = cp->cache_uprev;
 779                 cp->cache_uprev = cp->cache_unext = NULL;
 780                 cp->cache_uflags = 0;
 781         }
 782         /*
 783          * Make sure it is in the Inactive state
 784          */
 785         ASSERT(cp->cache_unext == NULL && cp->cache_uflags == 0);
 786         (void) mutex_unlock(&umem_update_lock);
 787 }
 788
 789 static void
 790 umem_updateall(int flags)
 791 {
 792         umem_cache_t *cp;
 793
 794         /*
 795          * NOTE:  To prevent deadlock, umem_cache_lock is always acquired first.
 796          *
 797          * (umem_add_update is called from things run via umem_cache_applyall)
 798          */
 799         (void) mutex_lock(&umem_cache_lock);
 800         (void) mutex_lock(&umem_update_lock);
 801
 802         for (cp = umem_null_cache.cache_next; cp != &umem_null_cache;
 803             cp = cp->cache_next)
 804                 umem_add_update_unlocked(cp, flags);
 805
 806         if (!IN_UPDATE())
 807                 (void) cond_broadcast(&umem_update_cv);
 808
 809         (void) mutex_unlock(&umem_update_lock);
 810         (void) mutex_unlock(&umem_cache_lock);
 811 }
 812
 813 /*
 814  * Debugging support.  Given a buffer address, find its slab.
 815  */
 816 static umem_slab_t *
 817 umem_findslab(umem_cache_t *cp, void *buf)
 818 {
 819         umem_slab_t *sp;
 820
 821         (void) mutex_lock(&cp->cache_lock);
 822         for (sp = cp->cache_nullslab.slab_next;
 823             sp != &cp->cache_nullslab; sp = sp->slab_next) {
 824                 if (UMEM_SLAB_MEMBER(sp, buf)) {
 825                         (void) mutex_unlock(&cp->cache_lock);
 826                         return (sp);
 827                 }
 828         }
 829         (void) mutex_unlock(&cp->cache_lock);
 830
 831         return (NULL);
 832 }
 833
 834 static void
 835 umem_error(int error, umem_cache_t *cparg, void *bufarg)
 836 {
 837         umem_buftag_t *btp = NULL;
 838         umem_bufctl_t *bcp = NULL;
 839         umem_cache_t *cp = cparg;
 840         umem_slab_t *sp;
 841         uint64_t *off;
 842         void *buf = bufarg;
 843
 844         int old_logging = umem_logging;
 845
 846         umem_logging = 0;       /* stop logging when a bad thing happens */
 847
 848         umem_abort_info.ump_timestamp = gethrtime();
 849
 850         sp = umem_findslab(cp, buf);
 851         if (sp == NULL) {
 852                 for (cp = umem_null_cache.cache_prev; cp != &umem_null_cache;
 853                     cp = cp->cache_prev) {
 854                         if ((sp = umem_findslab(cp, buf)) != NULL)
 855                                 break;
 856                 }
 857         }
 858
 859         if (sp == NULL) {
 860                 cp = NULL;
 861                 error = UMERR_BADADDR;
 862         } else {
 863                 if (cp != cparg)
 864                         error = UMERR_BADCACHE;
 865                 else
 866                         buf = (char *)bufarg - ((uintptr_t)bufarg -
 867                             (uintptr_t)sp->slab_base) % cp->cache_chunksize;
 868                 if (buf != bufarg)
 869                         error = UMERR_BADBASE;
 870                 if (cp->cache_flags & UMF_BUFTAG)
 871                         btp = UMEM_BUFTAG(cp, buf);
 872                 if (cp->cache_flags & UMF_HASH) {
 873                         (void) mutex_lock(&cp->cache_lock);
 874                         for (bcp = *UMEM_HASH(cp, buf); bcp; bcp = bcp->bc_next)
 875                                 if (bcp->bc_addr == buf)
 876                                         break;
 877                         (void) mutex_unlock(&cp->cache_lock);
 878                         if (bcp == NULL && btp != NULL)
 879                                 bcp = btp->bt_bufctl;
 880                         if (umem_findslab(cp->cache_bufctl_cache, bcp) ==
 881                             NULL || P2PHASE((uintptr_t)bcp, UMEM_ALIGN) ||
 882                             bcp->bc_addr != buf) {
 883                                 error = UMERR_BADBUFCTL;
 884                                 bcp = NULL;
 885                         }
 886                 }
 887         }
 888
 889         umem_abort_info.ump_error = error;
 890         umem_abort_info.ump_buffer = bufarg;
 891         umem_abort_info.ump_realbuf = buf;
 892         umem_abort_info.ump_cache = cparg;
 893         umem_abort_info.ump_realcache = cp;
 894         umem_abort_info.ump_slab = sp;
 895         umem_abort_info.ump_bufctl = bcp;
 896
 897         umem_printf("umem allocator: ");
 898
 899         switch (error) {
 900
 901         case UMERR_MODIFIED:
 902                 umem_printf("buffer modified after being freed\n");
 903                 off = verify_pattern(UMEM_FREE_PATTERN, buf, cp->cache_verify);
 904                 if (off == NULL)        /* shouldn't happen */
 905                         off = buf;
 906                 umem_printf("modification occurred at offset 0x%lx "
 907                     "(0x%llx replaced by 0x%llx)\n",
 908                     (uintptr_t)off - (uintptr_t)buf,
 909                     (longlong_t)UMEM_FREE_PATTERN, (longlong_t)*off);
 910                 break;
 911
 912         case UMERR_REDZONE:
 913                 umem_printf("redzone violation: write past end of buffer\n");
 914                 break;
 915
 916         case UMERR_BADADDR:
 917                 umem_printf("invalid free: buffer not in cache\n");
 918                 break;
 919
 920         case UMERR_DUPFREE:
 921                 umem_printf("duplicate free: buffer freed twice\n");
 922                 break;
 923
 924         case UMERR_BADBUFTAG:
 925                 umem_printf("boundary tag corrupted\n");
 926                 umem_printf("bcp ^ bxstat = %lx, should be %lx\n",
 927                     (intptr_t)btp->bt_bufctl ^ btp->bt_bxstat,
 928                     UMEM_BUFTAG_FREE);
 929                 break;
 930
 931         case UMERR_BADBUFCTL:
 932                 umem_printf("bufctl corrupted\n");
 933                 break;
 934
 935         case UMERR_BADCACHE:
 936                 umem_printf("buffer freed to wrong cache\n");
 937                 umem_printf("buffer was allocated from %s,\n", cp->cache_name);
 938                 umem_printf("caller attempting free to %s.\n",
 939                     cparg->cache_name);
 940                 break;
 941
 942         case UMERR_BADSIZE:
 943                 umem_printf("bad free: free size (%u) != alloc size (%u)\n",
 944                     UMEM_SIZE_DECODE(((uint32_t *)btp)[0]),
 945                     UMEM_SIZE_DECODE(((uint32_t *)btp)[1]));
 946                 break;
 947
 948         case UMERR_BADBASE:
 949                 umem_printf("bad free: free address (%p) != alloc address "
 950                     "(%p)\n", bufarg, buf);
 951                 break;
 952         }
 953
 954         umem_printf("buffer=%p  bufctl=%p  cache: %s\n",
 955             bufarg, (void *)bcp, cparg->cache_name);
 956
 957         if (bcp != NULL && (cp->cache_flags & UMF_AUDIT) &&
 958             error != UMERR_BADBUFCTL) {
 959                 int d;
 960                 timespec_t ts;
 961                 hrtime_t diff;
 962                 umem_bufctl_audit_t *bcap = (umem_bufctl_audit_t *)bcp;
 963
 964                 diff = umem_abort_info.ump_timestamp - bcap->bc_timestamp;
 965                 ts.tv_sec = diff / NANOSEC;
 966                 ts.tv_nsec = diff % NANOSEC;
 967
 968                 umem_printf("previous transaction on buffer %p:\n", buf);
 969                 umem_printf("thread=%p  time=T-%ld.%09ld  slab=%p  cache: %s\n",
 970                     (void *)(intptr_t)bcap->bc_thread, ts.tv_sec, ts.tv_nsec,
 971                     (void *)sp, cp->cache_name);
 972                 for (d = 0; d < MIN(bcap->bc_depth, umem_stack_depth); d++) {
 973                         (void) print_sym((void *)bcap->bc_stack[d]);
 974                         umem_printf("\n");
 975                 }
 976         }
 977
 978         umem_err_recoverable("umem: heap corruption detected");
 979
 980         umem_logging = old_logging;     /* resume logging */
 981 }
 982
 983 void
 984 umem_nofail_callback(umem_nofail_callback_t *cb)
 985 {
 986         nofail_callback = cb;
 987 }
 988
 989 static int
 990 umem_alloc_retry(umem_cache_t *cp, int umflag)
 991 {
 992         if (cp == &umem_null_cache) {
 993                 if (umem_init())
 994                         return (1);                             /* retry */
 995                 /*
 996                  * Initialization failed.  Do normal failure processing.
 997                  */
 998         }
 999         if (umflag & UMEM_NOFAIL) {
1000                 int def_result = UMEM_CALLBACK_EXIT(255);
1001                 int result = def_result;
1002                 umem_nofail_callback_t *callback = nofail_callback;
1003
1004                 if (callback != NULL)
1005                         result = callback();
1006
1007                 if (result == UMEM_CALLBACK_RETRY)
1008                         return (1);
1009
1010                 if ((result & ~0xFF) != UMEM_CALLBACK_EXIT(0)) {
1011                         log_message("nofail callback returned %x\n", result);
1012                         result = def_result;
1013                 }
1014
1015                 /*
1016                  * only one thread will call exit
1017                  */
1018                 if (umem_nofail_exit_thr == thr_self())
1019                         umem_panic("recursive UMEM_CALLBACK_EXIT()\n");
1020
1021                 (void) mutex_lock(&umem_nofail_exit_lock);
1022                 umem_nofail_exit_thr = thr_self();
1023                 exit(result & 0xFF);
1024                 /*NOTREACHED*/
1025         }
1026         return (0);
1027 }
1028
1029 static umem_log_header_t *
1030 umem_log_init(size_t logsize)
1031 {
1032         umem_log_header_t *lhp;
1033         int nchunks = 4 * umem_max_ncpus;
1034         size_t lhsize = offsetof(umem_log_header_t, lh_cpu[umem_max_ncpus]);
1035         int i;
1036
1037         if (logsize == 0)
1038                 return (NULL);
1039
1040         /*
1041          * Make sure that lhp->lh_cpu[] is nicely aligned
1042          * to prevent false sharing of cache lines.
1043          */
1044         lhsize = P2ROUNDUP(lhsize, UMEM_ALIGN);
1045         lhp = vmem_xalloc(umem_log_arena, lhsize, 64, P2NPHASE(lhsize, 64), 0,
1046             NULL, NULL, VM_NOSLEEP);
1047         if (lhp == NULL)
1048                 goto fail;
1049
1050         bzero(lhp, lhsize);
1051
1052         (void) mutex_init(&lhp->lh_lock, USYNC_THREAD, NULL);
1053         lhp->lh_nchunks = nchunks;
1054         lhp->lh_chunksize = P2ROUNDUP(logsize / nchunks, PAGESIZE);
1055         if (lhp->lh_chunksize == 0)
1056                 lhp->lh_chunksize = PAGESIZE;
1057
1058         lhp->lh_base = vmem_alloc(umem_log_arena,
1059             lhp->lh_chunksize * nchunks, VM_NOSLEEP);
1060         if (lhp->lh_base == NULL)
1061                 goto fail;
1062
1063         lhp->lh_free = vmem_alloc(umem_log_arena,
1064             nchunks * sizeof (int), VM_NOSLEEP);
1065         if (lhp->lh_free == NULL)
1066                 goto fail;
1067
1068         bzero(lhp->lh_base, lhp->lh_chunksize * nchunks);
1069
1070         for (i = 0; i < umem_max_ncpus; i++) {
1071                 umem_cpu_log_header_t *clhp = &lhp->lh_cpu[i];
1072                 (void) mutex_init(&clhp->clh_lock, USYNC_THREAD, NULL);
1073                 clhp->clh_chunk = i;
1074         }
1075
1076         for (i = umem_max_ncpus; i < nchunks; i++)
1077                 lhp->lh_free[i] = i;
1078
1079         lhp->lh_head = umem_max_ncpus;
1080         lhp->lh_tail = 0;
1081
1082         return (lhp);
1083
1084 fail:
1085         if (lhp != NULL) {
1086                 if (lhp->lh_base != NULL)
1087                         vmem_free(umem_log_arena, lhp->lh_base,
1088                             lhp->lh_chunksize * nchunks);
1089
1090                 vmem_xfree(umem_log_arena, lhp, lhsize);
1091         }
1092         return (NULL);
1093 }
1094
1095 static void *
1096 umem_log_enter(umem_log_header_t *lhp, void *data, size_t size)
1097 {
1098         void *logspace;
1099         umem_cpu_log_header_t *clhp =
1100             &(lhp->lh_cpu[CPU(umem_cpu_mask)->cpu_number]);
1101
1102         if (lhp == NULL || umem_logging == 0)
1103                 return (NULL);
1104
1105         (void) mutex_lock(&clhp->clh_lock);
1106         clhp->clh_hits++;
1107         if (size > clhp->clh_avail) {
1108                 (void) mutex_lock(&lhp->lh_lock);
1109                 lhp->lh_hits++;
1110                 lhp->lh_free[lhp->lh_tail] = clhp->clh_chunk;
1111                 lhp->lh_tail = (lhp->lh_tail + 1) % lhp->lh_nchunks;
1112                 clhp->clh_chunk = lhp->lh_free[lhp->lh_head];
1113                 lhp->lh_head = (lhp->lh_head + 1) % lhp->lh_nchunks;
1114                 clhp->clh_current = lhp->lh_base +
1115                     clhp->clh_chunk * lhp->lh_chunksize;
1116                 clhp->clh_avail = lhp->lh_chunksize;
1117                 if (size > lhp->lh_chunksize)
1118                         size = lhp->lh_chunksize;
1119                 (void) mutex_unlock(&lhp->lh_lock);
1120         }
1121         logspace = clhp->clh_current;
1122         clhp->clh_current += size;
1123         clhp->clh_avail -= size;
1124         bcopy(data, logspace, size);
1125         (void) mutex_unlock(&clhp->clh_lock);
1126         return (logspace);
1127 }
1128
1129 #define UMEM_AUDIT(lp, cp, bcp)                                         \
1130 {                                                                       \
1131         umem_bufctl_audit_t *_bcp = (umem_bufctl_audit_t *)(bcp);       \
1132         _bcp->bc_timestamp = gethrtime();                               \
1133         _bcp->bc_thread = thr_self();                                   \
1134         _bcp->bc_depth = getpcstack(_bcp->bc_stack, umem_stack_depth,   \
1135             (cp != NULL) && (cp->cache_flags & UMF_CHECKSIGNAL));       \
1136         _bcp->bc_lastlog = umem_log_enter((lp), _bcp,                   \
1137             UMEM_BUFCTL_AUDIT_SIZE);                                    \
1138 }
1139
1140 static void
1141 umem_log_event(umem_log_header_t *lp, umem_cache_t *cp,
1142         umem_slab_t *sp, void *addr)
1143 {
1144         umem_bufctl_audit_t *bcp;
1145         UMEM_LOCAL_BUFCTL_AUDIT(&bcp);
1146
1147         bzero(bcp, UMEM_BUFCTL_AUDIT_SIZE);
1148         bcp->bc_addr = addr;
1149         bcp->bc_slab = sp;
1150         bcp->bc_cache = cp;
1151         UMEM_AUDIT(lp, cp, bcp);
1152 }
1153
1154 /*
1155  * Create a new slab for cache cp.
1156  */
1157 static umem_slab_t *
1158 umem_slab_create(umem_cache_t *cp, int umflag)
1159 {
1160         size_t slabsize = cp->cache_slabsize;
1161         size_t chunksize = cp->cache_chunksize;
1162         int cache_flags = cp->cache_flags;
1163         size_t color, chunks;
1164         char *buf, *slab;
1165         umem_slab_t *sp;
1166         umem_bufctl_t *bcp;
1167         vmem_t *vmp = cp->cache_arena;
1168
1169         color = cp->cache_color + cp->cache_align;
1170         if (color > cp->cache_maxcolor)
1171                 color = cp->cache_mincolor;
1172         cp->cache_color = color;
1173
1174         slab = vmem_alloc(vmp, slabsize, UMEM_VMFLAGS(umflag));
1175
1176         if (slab == NULL)
1177                 goto vmem_alloc_failure;
1178
1179         ASSERT(P2PHASE((uintptr_t)slab, vmp->vm_quantum) == 0);
1180
1181         if (!(cp->cache_cflags & UMC_NOTOUCH) &&
1182             (cp->cache_flags & UMF_DEADBEEF))
1183                 copy_pattern(UMEM_UNINITIALIZED_PATTERN, slab, slabsize);
1184
1185         if (cache_flags & UMF_HASH) {
1186                 if ((sp = _umem_cache_alloc(umem_slab_cache, umflag)) == NULL)
1187                         goto slab_alloc_failure;
1188                 chunks = (slabsize - color) / chunksize;
1189         } else {
1190                 sp = UMEM_SLAB(cp, slab);
1191                 chunks = (slabsize - sizeof (umem_slab_t) - color) / chunksize;
1192         }
1193
1194         sp->slab_cache  = cp;
1195         sp->slab_head   = NULL;
1196         sp->slab_refcnt = 0;
1197         sp->slab_base   = buf = slab + color;
1198         sp->slab_chunks = chunks;
1199
1200         ASSERT(chunks > 0);
1201         while (chunks-- != 0) {
1202                 if (cache_flags & UMF_HASH) {
1203                         bcp = _umem_cache_alloc(cp->cache_bufctl_cache, umflag);
1204                         if (bcp == NULL)
1205                                 goto bufctl_alloc_failure;
1206                         if (cache_flags & UMF_AUDIT) {
1207                                 umem_bufctl_audit_t *bcap =
1208                                     (umem_bufctl_audit_t *)bcp;
1209                                 bzero(bcap, UMEM_BUFCTL_AUDIT_SIZE);
1210                                 bcap->bc_cache = cp;
1211                         }
1212                         bcp->bc_addr = buf;
1213                         bcp->bc_slab = sp;
1214                 } else {
1215                         bcp = UMEM_BUFCTL(cp, buf);
1216                 }
1217                 if (cache_flags & UMF_BUFTAG) {
1218                         umem_buftag_t *btp = UMEM_BUFTAG(cp, buf);
1219                         btp->bt_redzone = UMEM_REDZONE_PATTERN;
1220                         btp->bt_bufctl = bcp;
1221                         btp->bt_bxstat = (intptr_t)bcp ^ UMEM_BUFTAG_FREE;
1222                         if (cache_flags & UMF_DEADBEEF) {
1223                                 copy_pattern(UMEM_FREE_PATTERN, buf,
1224                                     cp->cache_verify);
1225                         }
1226                 }
1227                 bcp->bc_next = sp->slab_head;
1228                 sp->slab_head = bcp;
1229                 buf += chunksize;
1230         }
1231
1232         umem_log_event(umem_slab_log, cp, sp, slab);
1233
1234         return (sp);
1235
1236 bufctl_alloc_failure:
1237
1238         while ((bcp = sp->slab_head) != NULL) {
1239                 sp->slab_head = bcp->bc_next;
1240                 _umem_cache_free(cp->cache_bufctl_cache, bcp);
1241         }
1242         _umem_cache_free(umem_slab_cache, sp);
1243
1244 slab_alloc_failure:
1245
1246         vmem_free(vmp, slab, slabsize);
1247
1248 vmem_alloc_failure:
1249
1250         umem_log_event(umem_failure_log, cp, NULL, NULL);
1251         atomic_add_64(&cp->cache_alloc_fail, 1);
1252
1253         return (NULL);
1254 }
1255
1256 /*
1257  * Destroy a slab.
1258  */
1259 static void
1260 umem_slab_destroy(umem_cache_t *cp, umem_slab_t *sp)
1261 {
1262         vmem_t *vmp = cp->cache_arena;
1263         void *slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, vmp->vm_quantum);
1264
1265         if (cp->cache_flags & UMF_HASH) {
1266                 umem_bufctl_t *bcp;
1267                 while ((bcp = sp->slab_head) != NULL) {
1268                         sp->slab_head = bcp->bc_next;
1269                         _umem_cache_free(cp->cache_bufctl_cache, bcp);
1270                 }
1271                 _umem_cache_free(umem_slab_cache, sp);
1272         }
1273         vmem_free(vmp, slab, cp->cache_slabsize);
1274 }
1275
1276 /*
1277  * Allocate a raw (unconstructed) buffer from cp's slab layer.
1278  */
1279 static void *
1280 umem_slab_alloc(umem_cache_t *cp, int umflag)
1281 {
1282         umem_bufctl_t *bcp, **hash_bucket;
1283         umem_slab_t *sp;
1284         void *buf;
1285
1286         (void) mutex_lock(&cp->cache_lock);
1287         cp->cache_slab_alloc++;
1288         sp = cp->cache_freelist;
1289         ASSERT(sp->slab_cache == cp);
1290         if (sp->slab_head == NULL) {
1291                 /*
1292                  * The freelist is empty.  Create a new slab.
1293                  */
1294                 (void) mutex_unlock(&cp->cache_lock);
1295                 if (cp == &umem_null_cache)
1296                         return (NULL);
1297                 if ((sp = umem_slab_create(cp, umflag)) == NULL)
1298                         return (NULL);
1299                 (void) mutex_lock(&cp->cache_lock);
1300                 cp->cache_slab_create++;
1301                 if ((cp->cache_buftotal += sp->slab_chunks) > cp->cache_bufmax)
1302                         cp->cache_bufmax = cp->cache_buftotal;
1303                 sp->slab_next = cp->cache_freelist;
1304                 sp->slab_prev = cp->cache_freelist->slab_prev;
1305                 sp->slab_next->slab_prev = sp;
1306                 sp->slab_prev->slab_next = sp;
1307                 cp->cache_freelist = sp;
1308         }
1309
1310         sp->slab_refcnt++;
1311         ASSERT(sp->slab_refcnt <= sp->slab_chunks);
1312
1313         /*
1314          * If we're taking the last buffer in the slab,
1315          * remove the slab from the cache's freelist.
1316          */
1317         bcp = sp->slab_head;
1318         if ((sp->slab_head = bcp->bc_next) == NULL) {
1319                 cp->cache_freelist = sp->slab_next;
1320                 ASSERT(sp->slab_refcnt == sp->slab_chunks);
1321         }
1322
1323         if (cp->cache_flags & UMF_HASH) {
1324                 /*
1325                  * Add buffer to allocated-address hash table.
1326                  */
1327                 buf = bcp->bc_addr;
1328                 hash_bucket = UMEM_HASH(cp, buf);
1329                 bcp->bc_next = *hash_bucket;
1330                 *hash_bucket = bcp;
1331                 if ((cp->cache_flags & (UMF_AUDIT | UMF_BUFTAG)) == UMF_AUDIT) {
1332                         UMEM_AUDIT(umem_transaction_log, cp, bcp);
1333                 }
1334         } else {
1335                 buf = UMEM_BUF(cp, bcp);
1336         }
1337
1338         ASSERT(UMEM_SLAB_MEMBER(sp, buf));
1339
1340         (void) mutex_unlock(&cp->cache_lock);
1341
1342         return (buf);
1343 }
1344
1345 /*
1346  * Free a raw (unconstructed) buffer to cp's slab layer.
1347  */
1348 static void
1349 umem_slab_free(umem_cache_t *cp, void *buf)
1350 {
1351         umem_slab_t *sp;
1352         umem_bufctl_t *bcp, **prev_bcpp;
1353
1354         ASSERT(buf != NULL);
1355
1356         (void) mutex_lock(&cp->cache_lock);
1357         cp->cache_slab_free++;
1358
1359         if (cp->cache_flags & UMF_HASH) {
1360                 /*
1361                  * Look up buffer in allocated-address hash table.
1362                  */
1363                 prev_bcpp = UMEM_HASH(cp, buf);
1364                 while ((bcp = *prev_bcpp) != NULL) {
1365                         if (bcp->bc_addr == buf) {
1366                                 *prev_bcpp = bcp->bc_next;
1367                                 sp = bcp->bc_slab;
1368                                 break;
1369                         }
1370                         cp->cache_lookup_depth++;
1371                         prev_bcpp = &bcp->bc_next;
1372                 }
1373         } else {
1374                 bcp = UMEM_BUFCTL(cp, buf);
1375                 sp = UMEM_SLAB(cp, buf);
1376         }
1377
1378         if (bcp == NULL || sp->slab_cache != cp || !UMEM_SLAB_MEMBER(sp, buf)) {
1379                 (void) mutex_unlock(&cp->cache_lock);
1380                 umem_error(UMERR_BADADDR, cp, buf);
1381                 return;
1382         }
1383
1384         if ((cp->cache_flags & (UMF_AUDIT | UMF_BUFTAG)) == UMF_AUDIT) {
1385                 if (cp->cache_flags & UMF_CONTENTS)
1386                         ((umem_bufctl_audit_t *)bcp)->bc_contents =
1387                             umem_log_enter(umem_content_log, buf,
1388                             cp->cache_contents);
1389                 UMEM_AUDIT(umem_transaction_log, cp, bcp);
1390         }
1391
1392         /*
1393          * If this slab isn't currently on the freelist, put it there.
1394          */
1395         if (sp->slab_head == NULL) {
1396                 ASSERT(sp->slab_refcnt == sp->slab_chunks);
1397                 ASSERT(cp->cache_freelist != sp);
1398                 sp->slab_next->slab_prev = sp->slab_prev;
1399                 sp->slab_prev->slab_next = sp->slab_next;
1400                 sp->slab_next = cp->cache_freelist;
1401                 sp->slab_prev = cp->cache_freelist->slab_prev;
1402                 sp->slab_next->slab_prev = sp;
1403                 sp->slab_prev->slab_next = sp;
1404                 cp->cache_freelist = sp;
1405         }
1406
1407         bcp->bc_next = sp->slab_head;
1408         sp->slab_head = bcp;
1409
1410         ASSERT(sp->slab_refcnt >= 1);
1411         if (--sp->slab_refcnt == 0) {
1412                 /*
1413                  * There are no outstanding allocations from this slab,
1414                  * so we can reclaim the memory.
1415                  */
1416                 sp->slab_next->slab_prev = sp->slab_prev;
1417                 sp->slab_prev->slab_next = sp->slab_next;
1418                 if (sp == cp->cache_freelist)
1419                         cp->cache_freelist = sp->slab_next;
1420                 cp->cache_slab_destroy++;
1421                 cp->cache_buftotal -= sp->slab_chunks;
1422                 (void) mutex_unlock(&cp->cache_lock);
1423                 umem_slab_destroy(cp, sp);
1424                 return;
1425         }
1426         (void) mutex_unlock(&cp->cache_lock);
1427 }
1428
1429 static int
1430 umem_cache_alloc_debug(umem_cache_t *cp, void *buf, int umflag)
1431 {
1432         umem_buftag_t *btp = UMEM_BUFTAG(cp, buf);
1433         umem_bufctl_audit_t *bcp = (umem_bufctl_audit_t *)btp->bt_bufctl;
1434         uint32_t mtbf;
1435         int flags_nfatal;
1436
1437         if (btp->bt_bxstat != ((intptr_t)bcp ^ UMEM_BUFTAG_FREE)) {
1438                 umem_error(UMERR_BADBUFTAG, cp, buf);
1439                 return (-1);
1440         }
1441
1442         btp->bt_bxstat = (intptr_t)bcp ^ UMEM_BUFTAG_ALLOC;
1443
1444         if ((cp->cache_flags & UMF_HASH) && bcp->bc_addr != buf) {
1445                 umem_error(UMERR_BADBUFCTL, cp, buf);
1446                 return (-1);
1447         }
1448
1449         btp->bt_redzone = UMEM_REDZONE_PATTERN;
1450
1451         if (cp->cache_flags & UMF_DEADBEEF) {
1452                 if (verify_and_copy_pattern(UMEM_FREE_PATTERN,
1453                     UMEM_UNINITIALIZED_PATTERN, buf, cp->cache_verify)) {
1454                         umem_error(UMERR_MODIFIED, cp, buf);
1455                         return (-1);
1456                 }
1457         }
1458
1459         if ((mtbf = umem_mtbf | cp->cache_mtbf) != 0 &&
1460             gethrtime() % mtbf == 0 &&
1461             (umflag & (UMEM_FATAL_FLAGS)) == 0) {
1462                 umem_log_event(umem_failure_log, cp, NULL, NULL);
1463         } else {
1464                 mtbf = 0;
1465         }
1466
1467         /*
1468          * We do not pass fatal flags on to the constructor.  This prevents
1469          * leaking buffers in the event of a subordinate constructor failing.
1470          */
1471         flags_nfatal = UMEM_DEFAULT;
1472         if (mtbf || (cp->cache_constructor != NULL &&
1473             cp->cache_constructor(buf, cp->cache_private, flags_nfatal) != 0)) {
1474                 atomic_add_64(&cp->cache_alloc_fail, 1);
1475                 btp->bt_bxstat = (intptr_t)bcp ^ UMEM_BUFTAG_FREE;
1476                 copy_pattern(UMEM_FREE_PATTERN, buf, cp->cache_verify);
1477                 umem_slab_free(cp, buf);
1478                 return (-1);
1479         }
1480
1481         if (cp->cache_flags & UMF_AUDIT) {
1482                 UMEM_AUDIT(umem_transaction_log, cp, bcp);
1483         }
1484
1485         return (0);
1486 }
1487
1488 static int
1489 umem_cache_free_debug(umem_cache_t *cp, void *buf)
1490 {
1491         umem_buftag_t *btp = UMEM_BUFTAG(cp, buf);
1492         umem_bufctl_audit_t *bcp = (umem_bufctl_audit_t *)btp->bt_bufctl;
1493         umem_slab_t *sp;
1494
1495         if (btp->bt_bxstat != ((intptr_t)bcp ^ UMEM_BUFTAG_ALLOC)) {
1496                 if (btp->bt_bxstat == ((intptr_t)bcp ^ UMEM_BUFTAG_FREE)) {
1497                         umem_error(UMERR_DUPFREE, cp, buf);
1498                         return (-1);
1499                 }
1500                 sp = umem_findslab(cp, buf);
1501                 if (sp == NULL || sp->slab_cache != cp)
1502                         umem_error(UMERR_BADADDR, cp, buf);
1503                 else
1504                         umem_error(UMERR_REDZONE, cp, buf);
1505                 return (-1);
1506         }
1507
1508         btp->bt_bxstat = (intptr_t)bcp ^ UMEM_BUFTAG_FREE;
1509
1510         if ((cp->cache_flags & UMF_HASH) && bcp->bc_addr != buf) {
1511                 umem_error(UMERR_BADBUFCTL, cp, buf);
1512                 return (-1);
1513         }
1514
1515         if (btp->bt_redzone != UMEM_REDZONE_PATTERN) {
1516                 umem_error(UMERR_REDZONE, cp, buf);
1517                 return (-1);
1518         }
1519
1520         if (cp->cache_flags & UMF_AUDIT) {
1521                 if (cp->cache_flags & UMF_CONTENTS)
1522                         bcp->bc_contents = umem_log_enter(umem_content_log,
1523                             buf, cp->cache_contents);
1524                 UMEM_AUDIT(umem_transaction_log, cp, bcp);
1525         }
1526
1527         if (cp->cache_destructor != NULL)
1528                 cp->cache_destructor(buf, cp->cache_private);
1529
1530         if (cp->cache_flags & UMF_DEADBEEF)
1531                 copy_pattern(UMEM_FREE_PATTERN, buf, cp->cache_verify);
1532
1533         return (0);
1534 }
1535
1536 /*
1537  * Free each object in magazine mp to cp's slab layer, and free mp itself.
1538  */
1539 static void
1540 umem_magazine_destroy(umem_cache_t *cp, umem_magazine_t *mp, int nrounds)
1541 {
1542         int round;
1543
1544         ASSERT(cp->cache_next == NULL || IN_UPDATE());
1545
1546         for (round = 0; round < nrounds; round++) {
1547                 void *buf = mp->mag_round[round];
1548
1549                 if ((cp->cache_flags & UMF_DEADBEEF) &&
1550                     verify_pattern(UMEM_FREE_PATTERN, buf,
1551                     cp->cache_verify) != NULL) {
1552                         umem_error(UMERR_MODIFIED, cp, buf);
1553                         continue;
1554                 }
1555
1556                 if (!(cp->cache_flags & UMF_BUFTAG) &&
1557                     cp->cache_destructor != NULL)
1558                         cp->cache_destructor(buf, cp->cache_private);
1559
1560                 umem_slab_free(cp, buf);
1561         }
1562         ASSERT(UMEM_MAGAZINE_VALID(cp, mp));
1563         _umem_cache_free(cp->cache_magtype->mt_cache, mp);
1564 }
1565
1566 /*
1567  * Allocate a magazine from the depot.
1568  */
1569 static umem_magazine_t *
1570 umem_depot_alloc(umem_cache_t *cp, umem_maglist_t *mlp)
1571 {
1572         umem_magazine_t *mp;
1573
1574         /*
1575          * If we can't get the depot lock without contention,
1576          * update our contention count.  We use the depot
1577          * contention rate to determine whether we need to
1578          * increase the magazine size for better scalability.
1579          */
1580         if (mutex_trylock(&cp->cache_depot_lock) != 0) {
1581                 (void) mutex_lock(&cp->cache_depot_lock);
1582                 cp->cache_depot_contention++;
1583         }
1584
1585         if ((mp = mlp->ml_list) != NULL) {
1586                 ASSERT(UMEM_MAGAZINE_VALID(cp, mp));
1587                 mlp->ml_list = mp->mag_next;
1588                 if (--mlp->ml_total < mlp->ml_min)
1589                         mlp->ml_min = mlp->ml_total;
1590                 mlp->ml_alloc++;
1591         }
1592
1593         (void) mutex_unlock(&cp->cache_depot_lock);
1594
1595         return (mp);
1596 }
1597
1598 /*
1599  * Free a magazine to the depot.
1600  */
1601 static void
1602 umem_depot_free(umem_cache_t *cp, umem_maglist_t *mlp, umem_magazine_t *mp)
1603 {
1604         (void) mutex_lock(&cp->cache_depot_lock);
1605         ASSERT(UMEM_MAGAZINE_VALID(cp, mp));
1606         mp->mag_next = mlp->ml_list;
1607         mlp->ml_list = mp;
1608         mlp->ml_total++;
1609         (void) mutex_unlock(&cp->cache_depot_lock);
1610 }
1611
1612 /*
1613  * Update the working set statistics for cp's depot.
1614  */
1615 static void
1616 umem_depot_ws_update(umem_cache_t *cp)
1617 {
1618         (void) mutex_lock(&cp->cache_depot_lock);
1619         cp->cache_full.ml_reaplimit = cp->cache_full.ml_min;
1620         cp->cache_full.ml_min = cp->cache_full.ml_total;
1621         cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_min;
1622         cp->cache_empty.ml_min = cp->cache_empty.ml_total;
1623         (void) mutex_unlock(&cp->cache_depot_lock);
1624 }
1625
1626 /*
1627  * Reap all magazines that have fallen out of the depot's working set.
1628  */
1629 static void
1630 umem_depot_ws_reap(umem_cache_t *cp)
1631 {
1632         long reap;
1633         umem_magazine_t *mp;
1634
1635         ASSERT(cp->cache_next == NULL || IN_REAP());
1636
1637         reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min);
1638         while (reap-- && (mp = umem_depot_alloc(cp, &cp->cache_full)) != NULL)
1639                 umem_magazine_destroy(cp, mp, cp->cache_magtype->mt_magsize);
1640
1641         reap = MIN(cp->cache_empty.ml_reaplimit, cp->cache_empty.ml_min);
1642         while (reap-- && (mp = umem_depot_alloc(cp, &cp->cache_empty)) != NULL)
1643                 umem_magazine_destroy(cp, mp, 0);
1644 }
1645
1646 static void
1647 umem_cpu_reload(umem_cpu_cache_t *ccp, umem_magazine_t *mp, int rounds)
1648 {
1649         ASSERT((ccp->cc_loaded == NULL && ccp->cc_rounds == -1) ||
1650             (ccp->cc_loaded && ccp->cc_rounds + rounds == ccp->cc_magsize));
1651         ASSERT(ccp->cc_magsize > 0);
1652
1653         ccp->cc_ploaded = ccp->cc_loaded;
1654         ccp->cc_prounds = ccp->cc_rounds;
1655         ccp->cc_loaded = mp;
1656         ccp->cc_rounds = rounds;
1657 }
1658
1659 /*
1660  * Allocate a constructed object from cache cp.
1661  */
1662 #ifndef NO_WEAK_SYMBOLS
1663 #pragma weak umem_cache_alloc = _umem_cache_alloc
1664 #endif
1665 void *
1666 _umem_cache_alloc(umem_cache_t *cp, int umflag)
1667 {
1668         umem_cpu_cache_t *ccp;
1669         umem_magazine_t *fmp;
1670         void *buf;
1671         int flags_nfatal;
1672
1673 retry:
1674         ccp = UMEM_CPU_CACHE(cp, CPU(cp->cache_cpu_mask));
1675         (void) mutex_lock(&ccp->cc_lock);
1676         for (;;) {
1677                 /*
1678                  * If there's an object available in the current CPU's
1679                  * loaded magazine, just take it and return.
1680                  */
1681                 if (ccp->cc_rounds > 0) {
1682                         buf = ccp->cc_loaded->mag_round[--ccp->cc_rounds];
1683                         ccp->cc_alloc++;
1684                         (void) mutex_unlock(&ccp->cc_lock);
1685                         if ((ccp->cc_flags & UMF_BUFTAG) &&
1686                             umem_cache_alloc_debug(cp, buf, umflag) == -1) {
1687                                 if (umem_alloc_retry(cp, umflag)) {
1688                                         goto retry;
1689                                 }
1690
1691                                 return (NULL);
1692                         }
1693                         return (buf);
1694                 }
1695
1696                 /*
1697                  * The loaded magazine is empty.  If the previously loaded
1698                  * magazine was full, exchange them and try again.
1699                  */
1700                 if (ccp->cc_prounds > 0) {
1701                         umem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds);
1702                         continue;
1703                 }
1704
1705                 /*
1706                  * If the magazine layer is disabled, break out now.
1707                  */
1708                 if (ccp->cc_magsize == 0)
1709                         break;
1710
1711                 /*
1712                  * Try to get a full magazine from the depot.
1713                  */
1714                 fmp = umem_depot_alloc(cp, &cp->cache_full);
1715                 if (fmp != NULL) {
1716                         if (ccp->cc_ploaded != NULL)
1717                                 umem_depot_free(cp, &cp->cache_empty,
1718                                     ccp->cc_ploaded);
1719                         umem_cpu_reload(ccp, fmp, ccp->cc_magsize);
1720                         continue;
1721                 }
1722
1723                 /*
1724                  * There are no full magazines in the depot,
1725                  * so fall through to the slab layer.
1726                  */
1727                 break;
1728         }
1729         (void) mutex_unlock(&ccp->cc_lock);
1730
1731         /*
1732          * We couldn't allocate a constructed object from the magazine layer,
1733          * so get a raw buffer from the slab layer and apply its constructor.
1734          */
1735         buf = umem_slab_alloc(cp, umflag);
1736
1737         if (buf == NULL) {
1738                 if (cp == &umem_null_cache)
1739                         return (NULL);
1740                 if (umem_alloc_retry(cp, umflag)) {
1741                         goto retry;
1742                 }
1743
1744                 return (NULL);
1745         }
1746
1747         if (cp->cache_flags & UMF_BUFTAG) {
1748                 /*
1749                  * Let umem_cache_alloc_debug() apply the constructor for us.
1750                  */
1751                 if (umem_cache_alloc_debug(cp, buf, umflag) == -1) {
1752                         if (umem_alloc_retry(cp, umflag)) {
1753                                 goto retry;
1754                         }
1755                         return (NULL);
1756                 }
1757                 return (buf);
1758         }
1759
1760         /*
1761          * We do not pass fatal flags on to the constructor.  This prevents
1762          * leaking buffers in the event of a subordinate constructor failing.
1763          */
1764         flags_nfatal = UMEM_DEFAULT;
1765         if (cp->cache_constructor != NULL &&
1766             cp->cache_constructor(buf, cp->cache_private, flags_nfatal) != 0) {
1767                 atomic_add_64(&cp->cache_alloc_fail, 1);
1768                 umem_slab_free(cp, buf);
1769
1770                 if (umem_alloc_retry(cp, umflag)) {
1771                         goto retry;
1772                 }
1773                 return (NULL);
1774         }
1775
1776         return (buf);
1777 }
1778
1779 /*
1780  * Free a constructed object to cache cp.
1781  */
1782 #ifndef NO_WEAK_SYMBOLS
1783 #pragma weak umem_cache_free = _umem_cache_free
1784 #endif
1785 void
1786 _umem_cache_free(umem_cache_t *cp, void *buf)
1787 {
1788         umem_cpu_cache_t *ccp = UMEM_CPU_CACHE(cp, CPU(cp->cache_cpu_mask));
1789         umem_magazine_t *emp;
1790         umem_magtype_t *mtp;
1791
1792         if (ccp->cc_flags & UMF_BUFTAG)
1793                 if (umem_cache_free_debug(cp, buf) == -1)
1794                         return;
1795
1796         (void) mutex_lock(&ccp->cc_lock);
1797         for (;;) {
1798                 /*
1799                  * If there's a slot available in the current CPU's
1800                  * loaded magazine, just put the object there and return.
1801                  */
1802                 if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) {
1803                         ccp->cc_loaded->mag_round[ccp->cc_rounds++] = buf;
1804                         ccp->cc_free++;
1805                         (void) mutex_unlock(&ccp->cc_lock);
1806                         return;
1807                 }
1808
1809                 /*
1810                  * The loaded magazine is full.  If the previously loaded
1811                  * magazine was empty, exchange them and try again.
1812                  */
1813                 if (ccp->cc_prounds == 0) {
1814                         umem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds);
1815                         continue;
1816                 }
1817
1818                 /*
1819                  * If the magazine layer is disabled, break out now.
1820                  */
1821                 if (ccp->cc_magsize == 0)
1822                         break;
1823
1824                 /*
1825                  * Try to get an empty magazine from the depot.
1826                  */
1827                 emp = umem_depot_alloc(cp, &cp->cache_empty);
1828                 if (emp != NULL) {
1829                         if (ccp->cc_ploaded != NULL)
1830                                 umem_depot_free(cp, &cp->cache_full,
1831                                     ccp->cc_ploaded);
1832                         umem_cpu_reload(ccp, emp, 0);
1833                         continue;
1834                 }
1835
1836                 /*
1837                  * There are no empty magazines in the depot,
1838                  * so try to allocate a new one.  We must drop all locks
1839                  * across umem_cache_alloc() because lower layers may
1840                  * attempt to allocate from this cache.
1841                  */
1842                 mtp = cp->cache_magtype;
1843                 (void) mutex_unlock(&ccp->cc_lock);
1844                 emp = _umem_cache_alloc(mtp->mt_cache, UMEM_DEFAULT);
1845                 (void) mutex_lock(&ccp->cc_lock);
1846
1847                 if (emp != NULL) {
1848                         /*
1849                          * We successfully allocated an empty magazine.
1850                          * However, we had to drop ccp->cc_lock to do it,
1851                          * so the cache's magazine size may have changed.
1852                          * If so, free the magazine and try again.
1853                          */
1854                         if (ccp->cc_magsize != mtp->mt_magsize) {
1855                                 (void) mutex_unlock(&ccp->cc_lock);
1856                                 _umem_cache_free(mtp->mt_cache, emp);
1857                                 (void) mutex_lock(&ccp->cc_lock);
1858                                 continue;
1859                         }
1860
1861                         /*
1862                          * We got a magazine of the right size.  Add it to
1863                          * the depot and try the whole dance again.
1864                          */
1865                         umem_depot_free(cp, &cp->cache_empty, emp);
1866                         continue;
1867                 }
1868
1869                 /*
1870                  * We couldn't allocate an empty magazine,
1871                  * so fall through to the slab layer.
1872                  */
1873                 break;
1874         }
1875         (void) mutex_unlock(&ccp->cc_lock);
1876
1877         /*
1878          * We couldn't free our constructed object to the magazine layer,
1879          * so apply its destructor and free it to the slab layer.
1880          * Note that if UMF_BUFTAG is in effect, umem_cache_free_debug()
1881          * will have already applied the destructor.
1882          */
1883         if (!(cp->cache_flags & UMF_BUFTAG) && cp->cache_destructor != NULL)
1884                 cp->cache_destructor(buf, cp->cache_private);
1885
1886         umem_slab_free(cp, buf);
1887 }
1888
1889 #ifndef NO_WEAK_SYMBOLS
1890 #pragma weak umem_zalloc = _umem_zalloc
1891 #endif
1892 void *
1893 _umem_zalloc(size_t size, int umflag)
1894 {
1895         size_t index = (size - 1) >> UMEM_ALIGN_SHIFT;
1896         void *buf;
1897
1898 retry:
1899         if (index < UMEM_MAXBUF >> UMEM_ALIGN_SHIFT) {
1900                 umem_cache_t *cp = umem_alloc_table[index];
1901                 buf = _umem_cache_alloc(cp, umflag);
1902                 if (buf != NULL) {
1903                         if (cp->cache_flags & UMF_BUFTAG) {
1904                                 umem_buftag_t *btp = UMEM_BUFTAG(cp, buf);
1905                                 ((uint8_t *)buf)[size] = UMEM_REDZONE_BYTE;
1906                                 ((uint32_t *)btp)[1] = UMEM_SIZE_ENCODE(size);
1907                         }
1908                         bzero(buf, size);
1909                 } else if (umem_alloc_retry(cp, umflag))
1910                         goto retry;
1911         } else {
1912                 buf = _umem_alloc(size, umflag);        /* handles failure */
1913                 if (buf != NULL)
1914                         bzero(buf, size);
1915         }
1916         return (buf);
1917 }
1918
1919 #ifndef NO_WEAK_SYMBOLS
1920 #pragma weak umem_alloc = _umem_alloc
1921 #endif
1922 void *
1923 _umem_alloc(size_t size, int umflag)
1924 {
1925         size_t index = (size - 1) >> UMEM_ALIGN_SHIFT;
1926         void *buf;
1927 umem_alloc_retry:
1928         if (index < UMEM_MAXBUF >> UMEM_ALIGN_SHIFT) {
1929                 umem_cache_t *cp = umem_alloc_table[index];
1930                 buf = _umem_cache_alloc(cp, umflag);
1931                 if ((cp->cache_flags & UMF_BUFTAG) && buf != NULL) {
1932                         umem_buftag_t *btp = UMEM_BUFTAG(cp, buf);
1933                         ((uint8_t *)buf)[size] = UMEM_REDZONE_BYTE;
1934                         ((uint32_t *)btp)[1] = UMEM_SIZE_ENCODE(size);
1935                 }
1936                 if (buf == NULL && umem_alloc_retry(cp, umflag))
1937                         goto umem_alloc_retry;
1938                 return (buf);
1939         }
1940         if (size == 0)
1941                 return (NULL);
1942         if (umem_oversize_arena == NULL) {
1943                 if (umem_init())
1944                         ASSERT(umem_oversize_arena != NULL);
1945                 else
1946                         return (NULL);
1947         }
1948         buf = vmem_alloc(umem_oversize_arena, size, UMEM_VMFLAGS(umflag));
1949         if (buf == NULL) {
1950                 umem_log_event(umem_failure_log, NULL, NULL, (void *)size);
1951                 if (umem_alloc_retry(NULL, umflag))
1952                         goto umem_alloc_retry;
1953         }
1954         return (buf);
1955 }
1956
1957 #ifndef NO_WEAK_SYMBOLS
1958 #pragma weak umem_alloc_align = _umem_alloc_align
1959 #endif
1960 void *
1961 _umem_alloc_align(size_t size, size_t align, int umflag)
1962 {
1963         void *buf;
1964
1965         if (size == 0)
1966                 return (NULL);
1967         if ((align & (align - 1)) != 0)
1968                 return (NULL);
1969         if (align < UMEM_ALIGN)
1970                 align = UMEM_ALIGN;
1971
1972 umem_alloc_align_retry:
1973         if (umem_memalign_arena == NULL) {
1974                 if (umem_init())
1975                         ASSERT(umem_oversize_arena != NULL);
1976                 else
1977                         return (NULL);
1978         }
1979         buf = vmem_xalloc(umem_memalign_arena, size, align, 0, 0, NULL, NULL,
1980             UMEM_VMFLAGS(umflag));
1981         if (buf == NULL) {
1982                 umem_log_event(umem_failure_log, NULL, NULL, (void *)size);
1983                 if (umem_alloc_retry(NULL, umflag))
1984                         goto umem_alloc_align_retry;
1985         }
1986         return (buf);
1987 }
1988
1989 #ifndef NO_WEAK_SYMBOLS
1990 #pragma weak umem_free = _umem_free
1991 #endif
1992 void
1993 _umem_free(void *buf, size_t size)
1994 {
1995         size_t index = (size - 1) >> UMEM_ALIGN_SHIFT;
1996
1997         if (index < UMEM_MAXBUF >> UMEM_ALIGN_SHIFT) {
1998                 umem_cache_t *cp = umem_alloc_table[index];
1999                 if (cp->cache_flags & UMF_BUFTAG) {
2000                         umem_buftag_t *btp = UMEM_BUFTAG(cp, buf);
2001                         uint32_t *ip = (uint32_t *)btp;
2002                         if (ip[1] != UMEM_SIZE_ENCODE(size)) {
2003                                 if (*(uint64_t *)buf == UMEM_FREE_PATTERN) {
2004                                         umem_error(UMERR_DUPFREE, cp, buf);
2005                                         return;
2006                                 }
2007                                 if (UMEM_SIZE_VALID(ip[1])) {
2008                                         ip[0] = UMEM_SIZE_ENCODE(size);
2009                                         umem_error(UMERR_BADSIZE, cp, buf);
2010                                 } else {
2011                                         umem_error(UMERR_REDZONE, cp, buf);
2012                                 }
2013                                 return;
2014                         }
2015                         if (((uint8_t *)buf)[size] != UMEM_REDZONE_BYTE) {
2016                                 umem_error(UMERR_REDZONE, cp, buf);
2017                                 return;
2018                         }
2019                         btp->bt_redzone = UMEM_REDZONE_PATTERN;
2020                 }
2021                 _umem_cache_free(cp, buf);
2022         } else {
2023                 if (buf == NULL && size == 0)
2024                         return;
2025                 vmem_free(umem_oversize_arena, buf, size);
2026         }
2027 }
2028
2029 #ifndef NO_WEAK_SYMBOLS
2030 #pragma weak umem_free_align = _umem_free_align
2031 #endif
2032 void
2033 _umem_free_align(void *buf, size_t size)
2034 {
2035         if (buf == NULL && size == 0)
2036                 return;
2037         vmem_xfree(umem_memalign_arena, buf, size);
2038 }
2039
2040 static void *
2041 umem_firewall_va_alloc(vmem_t *vmp, size_t size, int vmflag)
2042 {
2043         size_t realsize = size + vmp->vm_quantum;
2044
2045         /*
2046          * Annoying edge case: if 'size' is just shy of ULONG_MAX, adding
2047          * vm_quantum will cause integer wraparound.  Check for this, and
2048          * blow off the firewall page in this case.  Note that such a
2049          * giant allocation (the entire address space) can never be
2050          * satisfied, so it will either fail immediately (VM_NOSLEEP)
2051          * or sleep forever (VM_SLEEP).  Thus, there is no need for a
2052          * corresponding check in umem_firewall_va_free().
2053          */
2054         if (realsize < size)
2055                 realsize = size;
2056
2057         return (vmem_alloc(vmp, realsize, vmflag | VM_NEXTFIT));
2058 }
2059
2060 static void
2061 umem_firewall_va_free(vmem_t *vmp, void *addr, size_t size)
2062 {
2063         vmem_free(vmp, addr, size + vmp->vm_quantum);
2064 }
2065
2066 /*
2067  * Reclaim all unused memory from a cache.
2068  */
2069 static void
2070 umem_cache_reap(umem_cache_t *cp)
2071 {
2072         /*
2073          * Ask the cache's owner to free some memory if possible.
2074          * The idea is to handle things like the inode cache, which
2075          * typically sits on a bunch of memory that it doesn't truly
2076          * *need*.  Reclaim policy is entirely up to the owner; this
2077          * callback is just an advisory plea for help.
2078          */
2079         if (cp->cache_reclaim != NULL)
2080                 cp->cache_reclaim(cp->cache_private);
2081
2082         umem_depot_ws_reap(cp);
2083 }
2084
2085 /*
2086  * Purge all magazines from a cache and set its magazine limit to zero.
2087  * All calls are serialized by being done by the update thread, except for
2088  * the final call from umem_cache_destroy().
2089  */
2090 static void
2091 umem_cache_magazine_purge(umem_cache_t *cp)
2092 {
2093         umem_cpu_cache_t *ccp;
2094         umem_magazine_t *mp, *pmp;
2095         int rounds, prounds, cpu_seqid;
2096
2097         ASSERT(cp->cache_next == NULL || IN_UPDATE());
2098
2099         for (cpu_seqid = 0; cpu_seqid < umem_max_ncpus; cpu_seqid++) {
2100                 ccp = &cp->cache_cpu[cpu_seqid];
2101
2102                 (void) mutex_lock(&ccp->cc_lock);
2103                 mp = ccp->cc_loaded;
2104                 pmp = ccp->cc_ploaded;
2105                 rounds = ccp->cc_rounds;
2106                 prounds = ccp->cc_prounds;
2107                 ccp->cc_loaded = NULL;
2108                 ccp->cc_ploaded = NULL;
2109                 ccp->cc_rounds = -1;
2110                 ccp->cc_prounds = -1;
2111                 ccp->cc_magsize = 0;
2112                 (void) mutex_unlock(&ccp->cc_lock);
2113
2114                 if (mp)
2115                         umem_magazine_destroy(cp, mp, rounds);
2116                 if (pmp)
2117                         umem_magazine_destroy(cp, pmp, prounds);
2118         }
2119
2120         /*
2121          * Updating the working set statistics twice in a row has the
2122          * effect of setting the working set size to zero, so everything
2123          * is eligible for reaping.
2124          */
2125         umem_depot_ws_update(cp);
2126         umem_depot_ws_update(cp);
2127
2128         umem_depot_ws_reap(cp);
2129 }
2130
2131 /*
2132  * Enable per-cpu magazines on a cache.
2133  */
2134 static void
2135 umem_cache_magazine_enable(umem_cache_t *cp)
2136 {
2137         int cpu_seqid;
2138
2139         if (cp->cache_flags & UMF_NOMAGAZINE)
2140                 return;
2141
2142         for (cpu_seqid = 0; cpu_seqid < umem_max_ncpus; cpu_seqid++) {
2143                 umem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
2144                 (void) mutex_lock(&ccp->cc_lock);
2145                 ccp->cc_magsize = cp->cache_magtype->mt_magsize;
2146                 (void) mutex_unlock(&ccp->cc_lock);
2147         }
2148
2149 }
2150
2151 /*
2152  * Recompute a cache's magazine size.  The trade-off is that larger magazines
2153  * provide a higher transfer rate with the depot, while smaller magazines
2154  * reduce memory consumption.  Magazine resizing is an expensive operation;
2155  * it should not be done frequently.
2156  *
2157  * Changes to the magazine size are serialized by only having one thread
2158  * doing updates. (the update thread)
2159  *
2160  * Note: at present this only grows the magazine size.  It might be useful
2161  * to allow shrinkage too.
2162  */
2163 static void
2164 umem_cache_magazine_resize(umem_cache_t *cp)
2165 {
2166         umem_magtype_t *mtp = cp->cache_magtype;
2167
2168         ASSERT(IN_UPDATE());
2169
2170         if (cp->cache_chunksize < mtp->mt_maxbuf) {
2171                 umem_cache_magazine_purge(cp);
2172                 (void) mutex_lock(&cp->cache_depot_lock);
2173                 cp->cache_magtype = ++mtp;
2174                 cp->cache_depot_contention_prev =
2175                     cp->cache_depot_contention + INT_MAX;
2176                 (void) mutex_unlock(&cp->cache_depot_lock);
2177                 umem_cache_magazine_enable(cp);
2178         }
2179 }
2180
2181 /*
2182  * Rescale a cache's hash table, so that the table size is roughly the
2183  * cache size.  We want the average lookup time to be extremely small.
2184  */
2185 static void
2186 umem_hash_rescale(umem_cache_t *cp)
2187 {
2188         umem_bufctl_t **old_table, **new_table, *bcp;
2189         size_t old_size, new_size, h;
2190
2191         ASSERT(IN_UPDATE());
2192
2193         new_size = MAX(UMEM_HASH_INITIAL,
2194             1 << (highbit(3 * cp->cache_buftotal + 4) - 2));
2195         old_size = cp->cache_hash_mask + 1;
2196
2197         if ((old_size >> 1) <= new_size && new_size <= (old_size << 1))
2198                 return;
2199
2200         new_table = vmem_alloc(umem_hash_arena, new_size * sizeof (void *),
2201             VM_NOSLEEP);
2202         if (new_table == NULL)
2203                 return;
2204         bzero(new_table, new_size * sizeof (void *));
2205
2206         (void) mutex_lock(&cp->cache_lock);
2207
2208         old_size = cp->cache_hash_mask + 1;
2209         old_table = cp->cache_hash_table;
2210
2211         cp->cache_hash_mask = new_size - 1;
2212         cp->cache_hash_table = new_table;
2213         cp->cache_rescale++;
2214
2215         for (h = 0; h < old_size; h++) {
2216                 bcp = old_table[h];
2217                 while (bcp != NULL) {
2218                         void *addr = bcp->bc_addr;
2219                         umem_bufctl_t *next_bcp = bcp->bc_next;
2220                         umem_bufctl_t **hash_bucket = UMEM_HASH(cp, addr);
2221                         bcp->bc_next = *hash_bucket;
2222                         *hash_bucket = bcp;
2223                         bcp = next_bcp;
2224                 }
2225         }
2226
2227         (void) mutex_unlock(&cp->cache_lock);
2228
2229         vmem_free(umem_hash_arena, old_table, old_size * sizeof (void *));
2230 }
2231
2232 /*
2233  * Perform periodic maintenance on a cache: hash rescaling,
2234  * depot working-set update, and magazine resizing.
2235  */
2236 void
2237 umem_cache_update(umem_cache_t *cp)
2238 {
2239         int update_flags = 0;
2240
2241         ASSERT(MUTEX_HELD(&umem_cache_lock));
2242
2243         /*
2244          * If the cache has become much larger or smaller than its hash table,
2245          * fire off a request to rescale the hash table.
2246          */
2247         (void) mutex_lock(&cp->cache_lock);
2248
2249         if ((cp->cache_flags & UMF_HASH) &&
2250             (cp->cache_buftotal > (cp->cache_hash_mask << 1) ||
2251             (cp->cache_buftotal < (cp->cache_hash_mask >> 1) &&
2252             cp->cache_hash_mask > UMEM_HASH_INITIAL)))
2253                 update_flags |= UMU_HASH_RESCALE;
2254
2255         (void) mutex_unlock(&cp->cache_lock);
2256
2257         /*
2258          * Update the depot working set statistics.
2259          */
2260         umem_depot_ws_update(cp);
2261
2262         /*
2263          * If there's a lot of contention in the depot,
2264          * increase the magazine size.
2265          */
2266         (void) mutex_lock(&cp->cache_depot_lock);
2267
2268         if (cp->cache_chunksize < cp->cache_magtype->mt_maxbuf &&
2269             (int)(cp->cache_depot_contention -
2270             cp->cache_depot_contention_prev) > umem_depot_contention)
2271                 update_flags |= UMU_MAGAZINE_RESIZE;
2272
2273         cp->cache_depot_contention_prev = cp->cache_depot_contention;
2274
2275         (void) mutex_unlock(&cp->cache_depot_lock);
2276
2277         if (update_flags)
2278                 umem_add_update(cp, update_flags);
2279 }
2280
2281 /*
2282  * Runs all pending updates.
2283  *
2284  * The update lock must be held on entrance, and will be held on exit.
2285  */
2286 void
2287 umem_process_updates(void)
2288 {
2289         ASSERT(MUTEX_HELD(&umem_update_lock));
2290
2291         while (umem_null_cache.cache_unext != &umem_null_cache) {
2292                 int notify = 0;
2293                 umem_cache_t *cp = umem_null_cache.cache_unext;
2294
2295                 cp->cache_uprev->cache_unext = cp->cache_unext;
2296                 cp->cache_unext->cache_uprev = cp->cache_uprev;
2297                 cp->cache_uprev = cp->cache_unext = NULL;
2298
2299                 ASSERT(!(cp->cache_uflags & UMU_ACTIVE));
2300
2301                 while (cp->cache_uflags) {
2302                         int uflags = (cp->cache_uflags |= UMU_ACTIVE);
2303                         (void) mutex_unlock(&umem_update_lock);
2304
2305                         /*
2306                          * The order here is important.  Each step can speed up
2307                          * later steps.
2308                          */
2309
2310                         if (uflags & UMU_HASH_RESCALE)
2311                                 umem_hash_rescale(cp);
2312
2313                         if (uflags & UMU_MAGAZINE_RESIZE)
2314                                 umem_cache_magazine_resize(cp);
2315
2316                         if (uflags & UMU_REAP)
2317                                 umem_cache_reap(cp);
2318
2319                         (void) mutex_lock(&umem_update_lock);
2320
2321                         /*
2322                          * check if anyone has requested notification
2323                          */
2324                         if (cp->cache_uflags & UMU_NOTIFY) {
2325                                 uflags |= UMU_NOTIFY;
2326                                 notify = 1;
2327                         }
2328                         cp->cache_uflags &= ~uflags;
2329                 }
2330                 if (notify)
2331                         (void) cond_broadcast(&umem_update_cv);
2332         }
2333 }
2334
2335 #ifndef UMEM_STANDALONE
2336 static void
2337 umem_st_update(void)
2338 {
2339         ASSERT(MUTEX_HELD(&umem_update_lock));
2340         ASSERT(umem_update_thr == 0 && umem_st_update_thr == 0);
2341
2342         umem_st_update_thr = thr_self();
2343
2344         (void) mutex_unlock(&umem_update_lock);
2345
2346         vmem_update(NULL);
2347         umem_cache_applyall(umem_cache_update);
2348
2349         (void) mutex_lock(&umem_update_lock);
2350
2351         umem_process_updates(); /* does all of the requested work */
2352
2353         umem_reap_next = gethrtime() +
2354             (hrtime_t)umem_reap_interval * NANOSEC;
2355
2356         umem_reaping = UMEM_REAP_DONE;
2357
2358         umem_st_update_thr = 0;
2359 }
2360 #endif
2361
2362 /*
2363  * Reclaim all unused memory from all caches.  Called from vmem when memory
2364  * gets tight.  Must be called with no locks held.
2365  *
2366  * This just requests a reap on all caches, and notifies the update thread.
2367  */
2368 void
2369 umem_reap(void)
2370 {
2371 #ifndef UMEM_STANDALONE
2372         extern int __nthreads(void);
2373 #endif
2374
2375         if (umem_ready != UMEM_READY || umem_reaping != UMEM_REAP_DONE ||
2376             gethrtime() < umem_reap_next)
2377                 return;
2378
2379         (void) mutex_lock(&umem_update_lock);
2380
2381         if (umem_reaping != UMEM_REAP_DONE || gethrtime() < umem_reap_next) {
2382                 (void) mutex_unlock(&umem_update_lock);
2383                 return;
2384         }
2385
2386         umem_reaping = UMEM_REAP_ADDING;        /* lock out other reaps */
2387
2388         (void) mutex_unlock(&umem_update_lock);
2389
2390         umem_updateall(UMU_REAP);
2391
2392         (void) mutex_lock(&umem_update_lock);
2393
2394         umem_reaping = UMEM_REAP_ACTIVE;
2395
2396         /* Standalone is single-threaded */
2397 #ifndef UMEM_STANDALONE
2398         if (umem_update_thr == 0) {
2399                 /*
2400                  * The update thread does not exist.  If the process is
2401                  * multi-threaded, create it.  If not, or the creation fails,
2402                  * do the update processing inline.
2403                  */
2404                 ASSERT(umem_st_update_thr == 0);
2405
2406                 if (__nthreads() <= 1 || umem_create_update_thread() == 0)
2407                         umem_st_update();
2408         }
2409
2410         (void) cond_broadcast(&umem_update_cv); /* wake up the update thread */
2411 #endif
2412
2413         (void) mutex_unlock(&umem_update_lock);
2414 }
2415
2416 umem_cache_t *
2417 umem_cache_create(
2418         char *name,             /* descriptive name for this cache */
2419         size_t bufsize,         /* size of the objects it manages */
2420         size_t align,           /* required object alignment */
2421         umem_constructor_t *constructor, /* object constructor */
2422         umem_destructor_t *destructor, /* object destructor */
2423         umem_reclaim_t *reclaim, /* memory reclaim callback */
2424         void *private,          /* pass-thru arg for constr/destr/reclaim */
2425         vmem_t *vmp,            /* vmem source for slab allocation */
2426         int cflags)             /* cache creation flags */
2427 {
2428         int cpu_seqid;
2429         size_t chunksize;
2430         umem_cache_t *cp, *cnext, *cprev;
2431         umem_magtype_t *mtp;
2432         size_t csize;
2433         size_t phase;
2434
2435         /*
2436          * The init thread is allowed to create internal and quantum caches.
2437          *
2438          * Other threads must wait until until initialization is complete.
2439          */
2440         if (umem_init_thr == thr_self())
2441                 ASSERT((cflags & (UMC_INTERNAL | UMC_QCACHE)) != 0);
2442         else {
2443                 ASSERT(!(cflags & UMC_INTERNAL));
2444                 if (umem_ready != UMEM_READY && umem_init() == 0) {
2445                         errno = EAGAIN;
2446                         return (NULL);
2447                 }
2448         }
2449
2450         csize = UMEM_CACHE_SIZE(umem_max_ncpus);
2451         phase = P2NPHASE(csize, UMEM_CPU_CACHE_SIZE);
2452
2453         if (vmp == NULL)
2454                 vmp = umem_default_arena;
2455
2456         ASSERT(P2PHASE(phase, UMEM_ALIGN) == 0);
2457
2458         /*
2459          * Check that the arguments are reasonable
2460          */
2461         if ((align & (align - 1)) != 0 || align > vmp->vm_quantum ||
2462             ((cflags & UMC_NOHASH) && (cflags & UMC_NOTOUCH)) ||
2463             name == NULL || bufsize == 0) {
2464                 errno = EINVAL;
2465                 return (NULL);
2466         }
2467
2468         /*
2469          * If align == 0, we set it to the minimum required alignment.
2470          *
2471          * If align < UMEM_ALIGN, we round it up to UMEM_ALIGN, unless
2472          * UMC_NOTOUCH was passed.
2473          */
2474         if (align == 0) {
2475                 if (P2ROUNDUP(bufsize, UMEM_ALIGN) >= UMEM_SECOND_ALIGN)
2476                         align = UMEM_SECOND_ALIGN;
2477                 else
2478                         align = UMEM_ALIGN;
2479         } else if (align < UMEM_ALIGN && (cflags & UMC_NOTOUCH) == 0)
2480                 align = UMEM_ALIGN;
2481
2482
2483         /*
2484          * Get a umem_cache structure.  We arrange that cp->cache_cpu[]
2485          * is aligned on a UMEM_CPU_CACHE_SIZE boundary to prevent
2486          * false sharing of per-CPU data.
2487          */
2488         cp = vmem_xalloc(umem_cache_arena, csize, UMEM_CPU_CACHE_SIZE, phase,
2489             0, NULL, NULL, VM_NOSLEEP);
2490
2491         if (cp == NULL) {
2492                 errno = EAGAIN;
2493                 return (NULL);
2494         }
2495
2496         bzero(cp, csize);
2497
2498         (void) mutex_lock(&umem_flags_lock);
2499         if (umem_flags & UMF_RANDOMIZE)
2500                 umem_flags = (((umem_flags | ~UMF_RANDOM) + 1) & UMF_RANDOM) |
2501                     UMF_RANDOMIZE;
2502         cp->cache_flags = umem_flags | (cflags & UMF_DEBUG);
2503         (void) mutex_unlock(&umem_flags_lock);
2504
2505         /*
2506          * Make sure all the various flags are reasonable.
2507          */
2508         if (cp->cache_flags & UMF_LITE) {
2509                 if (bufsize >= umem_lite_minsize &&
2510                     align <= umem_lite_maxalign &&
2511                     P2PHASE(bufsize, umem_lite_maxalign) != 0) {
2512                         cp->cache_flags |= UMF_BUFTAG;
2513                         cp->cache_flags &= ~(UMF_AUDIT | UMF_FIREWALL);
2514                 } else {
2515                         cp->cache_flags &= ~UMF_DEBUG;
2516                 }
2517         }
2518
2519         if ((cflags & UMC_QCACHE) && (cp->cache_flags & UMF_AUDIT))
2520                 cp->cache_flags |= UMF_NOMAGAZINE;
2521
2522         if (cflags & UMC_NODEBUG)
2523                 cp->cache_flags &= ~UMF_DEBUG;
2524
2525         if (cflags & UMC_NOTOUCH)
2526                 cp->cache_flags &= ~UMF_TOUCH;
2527
2528         if (cflags & UMC_NOHASH)
2529                 cp->cache_flags &= ~(UMF_AUDIT | UMF_FIREWALL);
2530
2531         if (cflags & UMC_NOMAGAZINE)
2532                 cp->cache_flags |= UMF_NOMAGAZINE;
2533
2534         if ((cp->cache_flags & UMF_AUDIT) && !(cflags & UMC_NOTOUCH))
2535                 cp->cache_flags |= UMF_REDZONE;
2536
2537         if ((cp->cache_flags & UMF_BUFTAG) && bufsize >= umem_minfirewall &&
2538             !(cp->cache_flags & UMF_LITE) && !(cflags & UMC_NOHASH))
2539                 cp->cache_flags |= UMF_FIREWALL;
2540
2541         if (vmp != umem_default_arena || umem_firewall_arena == NULL)
2542                 cp->cache_flags &= ~UMF_FIREWALL;
2543
2544         if (cp->cache_flags & UMF_FIREWALL) {
2545                 cp->cache_flags &= ~UMF_BUFTAG;
2546                 cp->cache_flags |= UMF_NOMAGAZINE;
2547                 ASSERT(vmp == umem_default_arena);
2548                 vmp = umem_firewall_arena;
2549         }
2550
2551         /*
2552          * Set cache properties.
2553          */
2554         (void) strncpy(cp->cache_name, name, sizeof (cp->cache_name) - 1);
2555         cp->cache_bufsize = bufsize;
2556         cp->cache_align = align;
2557         cp->cache_constructor = constructor;
2558         cp->cache_destructor = destructor;
2559         cp->cache_reclaim = reclaim;
2560         cp->cache_private = private;
2561         cp->cache_arena = vmp;
2562         cp->cache_cflags = cflags;
2563         cp->cache_cpu_mask = umem_cpu_mask;
2564
2565         /*
2566          * Determine the chunk size.
2567          */
2568         chunksize = bufsize;
2569
2570         if (align >= UMEM_ALIGN) {
2571                 chunksize = P2ROUNDUP(chunksize, UMEM_ALIGN);
2572                 cp->cache_bufctl = chunksize - UMEM_ALIGN;
2573         }
2574
2575         if (cp->cache_flags & UMF_BUFTAG) {
2576                 cp->cache_bufctl = chunksize;
2577                 cp->cache_buftag = chunksize;
2578                 chunksize += sizeof (umem_buftag_t);
2579         }
2580
2581         if (cp->cache_flags & UMF_DEADBEEF) {
2582                 cp->cache_verify = MIN(cp->cache_buftag, umem_maxverify);
2583                 if (cp->cache_flags & UMF_LITE)
2584                         cp->cache_verify = MIN(cp->cache_verify, UMEM_ALIGN);
2585         }
2586
2587         cp->cache_contents = MIN(cp->cache_bufctl, umem_content_maxsave);
2588
2589         cp->cache_chunksize = chunksize = P2ROUNDUP(chunksize, align);
2590
2591         if (chunksize < bufsize) {
2592                 errno = ENOMEM;
2593                 goto fail;
2594         }
2595
2596         /*
2597          * Now that we know the chunk size, determine the optimal slab size.
2598          */
2599         if (vmp == umem_firewall_arena) {
2600                 cp->cache_slabsize = P2ROUNDUP(chunksize, vmp->vm_quantum);
2601                 cp->cache_mincolor = cp->cache_slabsize - chunksize;
2602                 cp->cache_maxcolor = cp->cache_mincolor;
2603                 cp->cache_flags |= UMF_HASH;
2604                 ASSERT(!(cp->cache_flags & UMF_BUFTAG));
2605         } else if ((cflags & UMC_NOHASH) || (!(cflags & UMC_NOTOUCH) &&
2606             !(cp->cache_flags & UMF_AUDIT) &&
2607             chunksize < vmp->vm_quantum / UMEM_VOID_FRACTION)) {
2608                 cp->cache_slabsize = vmp->vm_quantum;
2609                 cp->cache_mincolor = 0;
2610                 cp->cache_maxcolor =
2611                     (cp->cache_slabsize - sizeof (umem_slab_t)) % chunksize;
2612
2613                 if (chunksize + sizeof (umem_slab_t) > cp->cache_slabsize) {
2614                         errno = EINVAL;
2615                         goto fail;
2616                 }
2617                 ASSERT(!(cp->cache_flags & UMF_AUDIT));
2618         } else {
2619                 size_t chunks, bestfit, waste, slabsize;
2620                 size_t minwaste = LONG_MAX;
2621
2622                 for (chunks = 1; chunks <= UMEM_VOID_FRACTION; chunks++) {
2623                         slabsize = P2ROUNDUP(chunksize * chunks,
2624                             vmp->vm_quantum);
2625                         /*
2626                          * check for overflow
2627                          */
2628                         if ((slabsize / chunks) < chunksize) {
2629                                 errno = ENOMEM;
2630                                 goto fail;
2631                         }
2632                         chunks = slabsize / chunksize;
2633                         waste = (slabsize % chunksize) / chunks;
2634                         if (waste < minwaste) {
2635                                 minwaste = waste;
2636                                 bestfit = slabsize;
2637                         }
2638                 }
2639                 if (cflags & UMC_QCACHE)
2640                         bestfit = MAX(1 << highbit(3 * vmp->vm_qcache_max), 64);
2641                 cp->cache_slabsize = bestfit;
2642                 cp->cache_mincolor = 0;
2643                 cp->cache_maxcolor = bestfit % chunksize;
2644                 cp->cache_flags |= UMF_HASH;
2645         }
2646
2647         if (cp->cache_flags & UMF_HASH) {
2648                 ASSERT(!(cflags & UMC_NOHASH));
2649                 cp->cache_bufctl_cache = (cp->cache_flags & UMF_AUDIT) ?
2650                     umem_bufctl_audit_cache : umem_bufctl_cache;
2651         }
2652
2653         if (cp->cache_maxcolor >= vmp->vm_quantum)
2654                 cp->cache_maxcolor = vmp->vm_quantum - 1;
2655
2656         cp->cache_color = cp->cache_mincolor;
2657
2658         /*
2659          * Initialize the rest of the slab layer.
2660          */
2661         (void) mutex_init(&cp->cache_lock, USYNC_THREAD, NULL);
2662
2663         cp->cache_freelist = &cp->cache_nullslab;
2664         cp->cache_nullslab.slab_cache = cp;
2665         cp->cache_nullslab.slab_refcnt = -1;
2666         cp->cache_nullslab.slab_next = &cp->cache_nullslab;
2667         cp->cache_nullslab.slab_prev = &cp->cache_nullslab;
2668
2669         if (cp->cache_flags & UMF_HASH) {
2670                 cp->cache_hash_table = vmem_alloc(umem_hash_arena,
2671                     UMEM_HASH_INITIAL * sizeof (void *), VM_NOSLEEP);
2672                 if (cp->cache_hash_table == NULL) {
2673                         errno = EAGAIN;
2674                         goto fail_lock;
2675                 }
2676                 bzero(cp->cache_hash_table,
2677                     UMEM_HASH_INITIAL * sizeof (void *));
2678                 cp->cache_hash_mask = UMEM_HASH_INITIAL - 1;
2679                 cp->cache_hash_shift = highbit((ulong_t)chunksize) - 1;
2680         }
2681
2682         /*
2683          * Initialize the depot.
2684          */
2685         (void) mutex_init(&cp->cache_depot_lock, USYNC_THREAD, NULL);
2686
2687         for (mtp = umem_magtype; chunksize <= mtp->mt_minbuf; mtp++)
2688                 continue;
2689
2690         cp->cache_magtype = mtp;
2691
2692         /*
2693          * Initialize the CPU layer.
2694          */
2695         for (cpu_seqid = 0; cpu_seqid < umem_max_ncpus; cpu_seqid++) {
2696                 umem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
2697                 (void) mutex_init(&ccp->cc_lock, USYNC_THREAD, NULL);
2698                 ccp->cc_flags = cp->cache_flags;
2699                 ccp->cc_rounds = -1;
2700                 ccp->cc_prounds = -1;
2701         }
2702
2703         /*
2704          * Add the cache to the global list.  This makes it visible
2705          * to umem_update(), so the cache must be ready for business.
2706          */
2707         (void) mutex_lock(&umem_cache_lock);
2708         cp->cache_next = cnext = &umem_null_cache;
2709         cp->cache_prev = cprev = umem_null_cache.cache_prev;
2710         cnext->cache_prev = cp;
2711         cprev->cache_next = cp;
2712         (void) mutex_unlock(&umem_cache_lock);
2713
2714         if (umem_ready == UMEM_READY)
2715                 umem_cache_magazine_enable(cp);
2716
2717         return (cp);
2718
2719 fail_lock:
2720         (void) mutex_destroy(&cp->cache_lock);
2721 fail:
2722         vmem_xfree(umem_cache_arena, cp, csize);
2723         return (NULL);
2724 }
2725
2726 void
2727 umem_cache_destroy(umem_cache_t *cp)
2728 {
2729         int cpu_seqid;
2730
2731         /*
2732          * Remove the cache from the global cache list so that no new updates
2733          * will be scheduled on its behalf, wait for any pending tasks to
2734          * complete, purge the cache, and then destroy it.
2735          */
2736         (void) mutex_lock(&umem_cache_lock);
2737         cp->cache_prev->cache_next = cp->cache_next;
2738         cp->cache_next->cache_prev = cp->cache_prev;
2739         cp->cache_prev = cp->cache_next = NULL;
2740         (void) mutex_unlock(&umem_cache_lock);
2741
2742         umem_remove_updates(cp);
2743
2744         umem_cache_magazine_purge(cp);
2745
2746         (void) mutex_lock(&cp->cache_lock);
2747         if (cp->cache_buftotal != 0)
2748                 log_message("umem_cache_destroy: '%s' (%p) not empty\n",
2749                     cp->cache_name, (void *)cp);
2750         cp->cache_reclaim = NULL;
2751         /*
2752          * The cache is now dead.  There should be no further activity.
2753          * We enforce this by setting land mines in the constructor and
2754          * destructor routines that induce a segmentation fault if invoked.
2755          */
2756         cp->cache_constructor = (umem_constructor_t *)1;
2757         cp->cache_destructor = (umem_destructor_t *)2;
2758         (void) mutex_unlock(&cp->cache_lock);
2759
2760         if (cp->cache_hash_table != NULL)
2761                 vmem_free(umem_hash_arena, cp->cache_hash_table,
2762                     (cp->cache_hash_mask + 1) * sizeof (void *));
2763
2764         for (cpu_seqid = 0; cpu_seqid < umem_max_ncpus; cpu_seqid++)
2765                 (void) mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock);
2766
2767         (void) mutex_destroy(&cp->cache_depot_lock);
2768         (void) mutex_destroy(&cp->cache_lock);
2769
2770         vmem_free(umem_cache_arena, cp, UMEM_CACHE_SIZE(umem_max_ncpus));
2771 }
2772
2773 static int
2774 umem_cache_init(void)
2775 {
2776         int i;
2777         size_t size, max_size;
2778         umem_cache_t *cp;
2779         umem_magtype_t *mtp;
2780         char name[UMEM_CACHE_NAMELEN + 1];
2781         umem_cache_t *umem_alloc_caches[NUM_ALLOC_SIZES];
2782
2783         for (i = 0; i < sizeof (umem_magtype) / sizeof (*mtp); i++) {
2784                 mtp = &umem_magtype[i];
2785                 (void) snprintf(name, sizeof (name), "umem_magazine_%d",
2786                     mtp->mt_magsize);
2787                 mtp->mt_cache = umem_cache_create(name,
2788                     (mtp->mt_magsize + 1) * sizeof (void *),
2789                     mtp->mt_align, NULL, NULL, NULL, NULL,
2790                     umem_internal_arena, UMC_NOHASH | UMC_INTERNAL);
2791                 if (mtp->mt_cache == NULL)
2792                         return (0);
2793         }
2794
2795         umem_slab_cache = umem_cache_create("umem_slab_cache",
2796             sizeof (umem_slab_t), 0, NULL, NULL, NULL, NULL,
2797             umem_internal_arena, UMC_NOHASH | UMC_INTERNAL);
2798
2799         if (umem_slab_cache == NULL)
2800                 return (0);
2801
2802         umem_bufctl_cache = umem_cache_create("umem_bufctl_cache",
2803             sizeof (umem_bufctl_t), 0, NULL, NULL, NULL, NULL,
2804             umem_internal_arena, UMC_NOHASH | UMC_INTERNAL);
2805
2806         if (umem_bufctl_cache == NULL)
2807                 return (0);
2808
2809         /*
2810          * The size of the umem_bufctl_audit structure depends upon
2811          * umem_stack_depth.   See umem_impl.h for details on the size
2812          * restrictions.
2813          */
2814
2815         size = UMEM_BUFCTL_AUDIT_SIZE_DEPTH(umem_stack_depth);
2816         max_size = UMEM_BUFCTL_AUDIT_MAX_SIZE;
2817
2818         if (size > max_size) {                  /* too large -- truncate */
2819                 int max_frames = UMEM_MAX_STACK_DEPTH;
2820
2821                 ASSERT(UMEM_BUFCTL_AUDIT_SIZE_DEPTH(max_frames) <= max_size);
2822
2823                 umem_stack_depth = max_frames;
2824                 size = UMEM_BUFCTL_AUDIT_SIZE_DEPTH(umem_stack_depth);
2825         }
2826
2827         umem_bufctl_audit_cache = umem_cache_create("umem_bufctl_audit_cache",
2828             size, 0, NULL, NULL, NULL, NULL, umem_internal_arena,
2829             UMC_NOHASH | UMC_INTERNAL);
2830
2831         if (umem_bufctl_audit_cache == NULL)
2832                 return (0);
2833
2834         if (vmem_backend & VMEM_BACKEND_MMAP)
2835                 umem_va_arena = vmem_create("umem_va",
2836                     NULL, 0, pagesize,
2837                     vmem_alloc, vmem_free, heap_arena,
2838                     8 * pagesize, VM_NOSLEEP);
2839         else
2840                 umem_va_arena = heap_arena;
2841
2842         if (umem_va_arena == NULL)
2843                 return (0);
2844
2845         umem_default_arena = vmem_create("umem_default",
2846             NULL, 0, pagesize,
2847             heap_alloc, heap_free, umem_va_arena,
2848             0, VM_NOSLEEP);
2849
2850         if (umem_default_arena == NULL)
2851                 return (0);
2852
2853         /*
2854          * make sure the umem_alloc table initializer is correct
2855          */
2856         i = sizeof (umem_alloc_table) / sizeof (*umem_alloc_table);
2857         ASSERT(umem_alloc_table[i - 1] == &umem_null_cache);
2858
2859         /*
2860          * Create the default caches to back umem_alloc()
2861          */
2862         for (i = 0; i < NUM_ALLOC_SIZES; i++) {
2863                 size_t cache_size = umem_alloc_sizes[i];
2864                 size_t align = 0;
2865                 /*
2866                  * If they allocate a multiple of the coherency granularity,
2867                  * they get a coherency-granularity-aligned address.
2868                  */
2869                 if (IS_P2ALIGNED(cache_size, 64))
2870                         align = 64;
2871                 if (IS_P2ALIGNED(cache_size, pagesize))
2872                         align = pagesize;
2873                 (void) snprintf(name, sizeof (name), "umem_alloc_%lu",
2874                     (long)cache_size);
2875
2876                 cp = umem_cache_create(name, cache_size, align,
2877                     NULL, NULL, NULL, NULL, NULL, UMC_INTERNAL);
2878                 if (cp == NULL)
2879                         return (0);
2880
2881                 umem_alloc_caches[i] = cp;
2882         }
2883
2884         /*
2885          * Initialization cannot fail at this point.  Make the caches
2886          * visible to umem_alloc() and friends.
2887          */
2888         size = UMEM_ALIGN;
2889         for (i = 0; i < NUM_ALLOC_SIZES; i++) {
2890                 size_t cache_size = umem_alloc_sizes[i];
2891
2892                 cp = umem_alloc_caches[i];
2893
2894                 while (size <= cache_size) {
2895                         umem_alloc_table[(size - 1) >> UMEM_ALIGN_SHIFT] = cp;
2896                         size += UMEM_ALIGN;
2897                 }
2898         }
2899         return (1);
2900 }
2901
2902 /*
2903  * umem_startup() is called early on, and must be called explicitly if we're
2904  * the standalone version.
2905  */
2906 static void
2907 umem_startup() __attribute__((constructor));
2908
2909 void
2910 umem_startup()
2911 {
2912         caddr_t start = NULL;
2913         size_t len = 0;
2914         size_t pagesize = 0;
2915
2916 #ifdef UMEM_STANDALONE
2917         int idx;
2918         /* Standalone doesn't fork */
2919 #else
2920         umem_forkhandler_init(); /* register the fork handler */
2921 #endif
2922
2923 #ifdef __lint
2924         /* make lint happy */
2925         minstack = maxstack;
2926 #endif
2927
2928 #ifdef UMEM_STANDALONE
2929         umem_ready = UMEM_READY_STARTUP;
2930         umem_init_env_ready = 0;
2931
2932         umem_min_stack = minstack;
2933         umem_max_stack = maxstack;
2934
2935         nofail_callback = NULL;
2936         umem_slab_cache = NULL;
2937         umem_bufctl_cache = NULL;
2938         umem_bufctl_audit_cache = NULL;
2939         heap_arena = NULL;
2940         heap_alloc = NULL;
2941         heap_free = NULL;
2942         umem_internal_arena = NULL;
2943         umem_cache_arena = NULL;
2944         umem_hash_arena = NULL;
2945         umem_log_arena = NULL;
2946         umem_oversize_arena = NULL;
2947         umem_va_arena = NULL;
2948         umem_default_arena = NULL;
2949         umem_firewall_va_arena = NULL;
2950         umem_firewall_arena = NULL;
2951         umem_memalign_arena = NULL;
2952         umem_transaction_log = NULL;
2953         umem_content_log = NULL;
2954         umem_failure_log = NULL;
2955         umem_slab_log = NULL;
2956         umem_cpu_mask = 0;
2957
2958         umem_cpus = &umem_startup_cpu;
2959         umem_startup_cpu.cpu_cache_offset = UMEM_CACHE_SIZE(0);
2960         umem_startup_cpu.cpu_number = 0;
2961
2962         bcopy(&umem_null_cache_template, &umem_null_cache,
2963             sizeof (umem_cache_t));
2964
2965         for (idx = 0; idx < (UMEM_MAXBUF >> UMEM_ALIGN_SHIFT); idx++)
2966                 umem_alloc_table[idx] = &umem_null_cache;
2967 #endif
2968
2969         /*
2970          * Perform initialization specific to the way we've been compiled
2971          * (library or standalone)
2972          */
2973         umem_type_init(start, len, pagesize);
2974
2975         vmem_startup();
2976 }
2977
2978 int
2979 umem_init(void)
2980 {
2981         size_t maxverify, minfirewall;
2982         size_t size;
2983         int idx;
2984         umem_cpu_t *new_cpus;
2985
2986         vmem_t *memalign_arena, *oversize_arena;
2987
2988         if (thr_self() != umem_init_thr) {
2989                 /*
2990                  * The usual case -- non-recursive invocation of umem_init().
2991                  */
2992                 (void) mutex_lock(&umem_init_lock);
2993                 if (umem_ready != UMEM_READY_STARTUP) {
2994                         /*
2995                          * someone else beat us to initializing umem.  Wait
2996                          * for them to complete, then return.
2997                          */
2998                         while (umem_ready == UMEM_READY_INITING)
2999                                 (void) _cond_wait(&umem_init_cv,
3000                                     &umem_init_lock);
3001                         ASSERT(umem_ready == UMEM_READY ||
3002                             umem_ready == UMEM_READY_INIT_FAILED);
3003                         (void) mutex_unlock(&umem_init_lock);
3004                         return (umem_ready == UMEM_READY);
3005                 }
3006
3007                 ASSERT(umem_ready == UMEM_READY_STARTUP);
3008                 ASSERT(umem_init_env_ready == 0);
3009
3010                 umem_ready = UMEM_READY_INITING;
3011                 umem_init_thr = thr_self();
3012
3013                 (void) mutex_unlock(&umem_init_lock);
3014                 umem_setup_envvars(0);          /* can recurse -- see below */
3015                 if (umem_init_env_ready) {
3016                         /*
3017                          * initialization was completed already
3018                          */
3019                         ASSERT(umem_ready == UMEM_READY ||
3020                             umem_ready == UMEM_READY_INIT_FAILED);
3021                         ASSERT(umem_init_thr == 0);
3022                         return (umem_ready == UMEM_READY);
3023                 }
3024         } else if (!umem_init_env_ready) {
3025                 /*
3026                  * The umem_setup_envvars() call (above) makes calls into
3027                  * the dynamic linker and directly into user-supplied code.
3028                  * Since we cannot know what that code will do, we could be
3029                  * recursively invoked (by, say, a malloc() call in the code
3030                  * itself, or in a (C++) _init section it causes to be fired).
3031                  *
3032                  * This code is where we end up if such recursion occurs.  We
3033                  * first clean up any partial results in the envvar code, then
3034                  * proceed to finish initialization processing in the recursive
3035                  * call.  The original call will notice this, and return
3036                  * immediately.
3037                  */
3038                 umem_setup_envvars(1);          /* clean up any partial state */
3039         } else {
3040                 umem_panic(
3041                     "recursive allocation while initializing umem\n");
3042         }
3043         umem_init_env_ready = 1;
3044
3045         /*
3046          * From this point until we finish, recursion into umem_init() will
3047          * cause a umem_panic().
3048          */
3049         maxverify = minfirewall = ULONG_MAX;
3050
3051         /* LINTED constant condition */
3052         if (sizeof (umem_cpu_cache_t) != UMEM_CPU_CACHE_SIZE) {
3053                 umem_panic("sizeof (umem_cpu_cache_t) = %d, should be %d\n",
3054                     sizeof (umem_cpu_cache_t), UMEM_CPU_CACHE_SIZE);
3055         }
3056
3057         umem_max_ncpus = umem_get_max_ncpus();
3058
3059         /*
3060          * load tunables from environment
3061          */
3062         umem_process_envvars();
3063
3064         if (issetugid())
3065                 umem_mtbf = 0;
3066
3067         /*
3068          * set up vmem
3069          */
3070         if (!(umem_flags & UMF_AUDIT))
3071                 vmem_no_debug();
3072
3073         heap_arena = vmem_heap_arena(&heap_alloc, &heap_free);
3074
3075         pagesize = heap_arena->vm_quantum;
3076
3077         umem_internal_arena = vmem_create("umem_internal", NULL, 0, pagesize,
3078             heap_alloc, heap_free, heap_arena, 0, VM_NOSLEEP);
3079
3080         umem_default_arena = umem_internal_arena;
3081
3082         if (umem_internal_arena == NULL)
3083                 goto fail;
3084
3085         umem_cache_arena = vmem_create("umem_cache", NULL, 0, UMEM_ALIGN,
3086             vmem_alloc, vmem_free, umem_internal_arena, 0, VM_NOSLEEP);
3087
3088         umem_hash_arena = vmem_create("umem_hash", NULL, 0, UMEM_ALIGN,
3089             vmem_alloc, vmem_free, umem_internal_arena, 0, VM_NOSLEEP);
3090
3091         umem_log_arena = vmem_create("umem_log", NULL, 0, UMEM_ALIGN,
3092             heap_alloc, heap_free, heap_arena, 0, VM_NOSLEEP);
3093
3094         umem_firewall_va_arena = vmem_create("umem_firewall_va",
3095             NULL, 0, pagesize,
3096             umem_firewall_va_alloc, umem_firewall_va_free, heap_arena,
3097             0, VM_NOSLEEP);
3098
3099         if (umem_cache_arena == NULL || umem_hash_arena == NULL ||
3100             umem_log_arena == NULL || umem_firewall_va_arena == NULL)
3101                 goto fail;
3102
3103         umem_firewall_arena = vmem_create("umem_firewall", NULL, 0, pagesize,
3104             heap_alloc, heap_free, umem_firewall_va_arena, 0,
3105             VM_NOSLEEP);
3106
3107         if (umem_firewall_arena == NULL)
3108                 goto fail;
3109
3110         oversize_arena = vmem_create("umem_oversize", NULL, 0, pagesize,
3111             heap_alloc, heap_free, minfirewall < ULONG_MAX ?
3112             umem_firewall_va_arena : heap_arena, 0, VM_NOSLEEP);
3113
3114         memalign_arena = vmem_create("umem_memalign", NULL, 0, UMEM_ALIGN,
3115             heap_alloc, heap_free, minfirewall < ULONG_MAX ?
3116             umem_firewall_va_arena : heap_arena, 0, VM_NOSLEEP);
3117
3118         if (oversize_arena == NULL || memalign_arena == NULL)
3119                 goto fail;
3120
3121         if (umem_max_ncpus > CPUHINT_MAX())
3122                 umem_max_ncpus = CPUHINT_MAX();
3123
3124         while ((umem_max_ncpus & (umem_max_ncpus - 1)) != 0)
3125                 umem_max_ncpus++;
3126
3127         if (umem_max_ncpus == 0)
3128                 umem_max_ncpus = 1;
3129
3130         size = umem_max_ncpus * sizeof (umem_cpu_t);
3131         new_cpus = vmem_alloc(umem_internal_arena, size, VM_NOSLEEP);
3132         if (new_cpus == NULL)
3133                 goto fail;
3134
3135         bzero(new_cpus, size);
3136         for (idx = 0; idx < umem_max_ncpus; idx++) {
3137                 new_cpus[idx].cpu_number = idx;
3138                 new_cpus[idx].cpu_cache_offset = UMEM_CACHE_SIZE(idx);
3139         }
3140         umem_cpus = new_cpus;
3141         umem_cpu_mask = (umem_max_ncpus - 1);
3142
3143         if (umem_maxverify == 0)
3144                 umem_maxverify = maxverify;
3145
3146         if (umem_minfirewall == 0)
3147                 umem_minfirewall = minfirewall;
3148
3149         /*
3150          * Set up updating and reaping
3151          */
3152         umem_reap_next = gethrtime() + NANOSEC;
3153
3154 #ifndef UMEM_STANDALONE
3155         (void) gettimeofday(&umem_update_next, NULL);
3156 #endif
3157
3158         /*
3159          * Set up logging -- failure here is okay, since it will just disable
3160          * the logs
3161          */
3162         if (umem_logging) {
3163                 umem_transaction_log = umem_log_init(umem_transaction_log_size);
3164                 umem_content_log = umem_log_init(umem_content_log_size);
3165                 umem_failure_log = umem_log_init(umem_failure_log_size);
3166                 umem_slab_log = umem_log_init(umem_slab_log_size);
3167         }
3168
3169         /*
3170          * Set up caches -- if successful, initialization cannot fail, since
3171          * allocations from other threads can now succeed.
3172          */
3173         if (umem_cache_init() == 0) {
3174                 log_message("unable to create initial caches\n");
3175                 goto fail;
3176         }
3177         umem_oversize_arena = oversize_arena;
3178         umem_memalign_arena = memalign_arena;
3179
3180         umem_cache_applyall(umem_cache_magazine_enable);
3181
3182         /*
3183          * initialization done, ready to go
3184          */
3185         (void) mutex_lock(&umem_init_lock);
3186         umem_ready = UMEM_READY;
3187         umem_init_thr = 0;
3188         (void) cond_broadcast(&umem_init_cv);
3189         (void) mutex_unlock(&umem_init_lock);
3190         return (1);
3191
3192 fail:
3193         log_message("umem initialization failed\n");
3194
3195         (void) mutex_lock(&umem_init_lock);
3196         umem_ready = UMEM_READY_INIT_FAILED;
3197         umem_init_thr = 0;
3198         (void) cond_broadcast(&umem_init_cv);
3199         (void) mutex_unlock(&umem_init_lock);
3200         return (0);
3201 }
3202
3203 size_t
3204 umem_cache_get_bufsize(umem_cache_t *cache)
3205 {
3206         return cache->cache_bufsize;
3207 }
3208