lib/libc/stdlib/dmalloc.c

   1 /*
   2  * DMALLOC.C    - Dillon's malloc
   3  *
   4  * Copyright (c) 2011,2017 The DragonFly Project. All rights reserved.
   5  *
   6  * This code is derived from software contributed to The DragonFly Project
   7  * by Matthew Dillon <dillon@backplane.com>.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  *
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in
  17  *    the documentation and/or other materials provided with the
  18  *    distribution.
  19  * 3. Neither the name of The DragonFly Project nor the names of its
  20  *    contributors may be used to endorse or promote products derived
  21  *    from this software without specific, prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  */
  36 /*
  37  * This module implements a modified slab allocator as a drop-in replacement
  38  * for the libc malloc().  The slab algorithm has been adjusted to support
  39  * dynamic sizing of slabs which effectively allows slabs to be used for
  40  * allocations of any size.  Because of this we neither have a small-block
  41  * allocator or a big-block allocator and the code paths are simplified.
  42  *
  43  * To support dynamic slab sizing available user virtual memory is broken
  44  * down into ~1024 regions.  Each region has fixed slab size whos value is
  45  * set when the region is opened up for use.  The free() path simply applies
  46  * a mask based on the region to the pointer to acquire the base of the
  47  * governing slab structure.
  48  *
  49  * Regions[NREGIONS]    (1024)
  50  *
  51  * Slab management and locking is done on a per-zone basis.
  52  *
  53  *      Alloc Size      Chunking        Number of zones
  54  *      0-127           8               16
  55  *      128-255         16              8
  56  *      256-511         32              8
  57  *      512-1023        64              8
  58  *      1024-2047       128             8
  59  *      2048-4095       256             8
  60  *      4096-8191       512             8
  61  *      8192-16383      1024            8
  62  *      16384-32767     2048            8
  63  *      32768-65535     4096            8
  64  *      ... continues forever ...       4 zones
  65  *
  66  *      For a 2^63 memory space each doubling >= 64K is broken down into
  67  *      4 chunking zones, so we support 88 + (48 * 4) = 280 zones.
  68  *
  69  *                         API FEATURES AND SIDE EFFECTS
  70  *
  71  *    + power-of-2 sized allocations up to a page will be power-of-2 aligned.
  72  *      Above that power-of-2 sized allocations are page-aligned.  Non
  73  *      power-of-2 sized allocations are aligned the same as the chunk
  74  *      size for their zone.
  75  *    + ability to allocate arbitrarily large chunks of memory
  76  *    + realloc will reuse the passed pointer if possible, within the
  77  *      limitations of the zone chunking.
  78  *
  79  * On top of the slab allocator we also implement a 16-entry-per-thread
  80  * magazine cache for allocations <= NOMSLABSIZE.
  81  *
  82  *                              FUTURE FEATURES
  83  *
  84  *    + [better] garbage collection
  85  *    + better initial sizing.
  86  *
  87  * TUNING
  88  *
  89  * The value of the environment variable MALLOC_OPTIONS is a character string
  90  * containing various flags to tune nmalloc.  Upper case letters enabled
  91  * or increase the feature, lower case disables or decreases the feature.
  92  *
  93  * U            Enable UTRACE for all operations, observable with ktrace.
  94  *              Diasbled by default.
  95  *
  96  * Z            Zero out allocations, otherwise allocations (except for
  97  *              calloc) will contain garbage.
  98  *              Disabled by default.
  99  *
 100  * H            Pass a hint with madvise() about unused pages.
 101  *              Disabled by default.
 102  *              Not currently implemented.
 103  *
 104  * F            Disable local per-thread caching.
 105  *              Disabled by default.
 106  *
 107  * C            Increase (decrease) how much excess cache to retain.
 108  *              Set to 4 by default.
 109  */
 110
 111 /* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o dmalloc.so dmalloc.c */
 112
 113 #ifndef STANDALONE_DEBUG
 114 #include "libc_private.h"
 115 #endif
 116
 117 #include <sys/param.h>
 118 #include <sys/types.h>
 119 #include <sys/mman.h>
 120 #include <sys/queue.h>
 121 #include <sys/ktrace.h>
 122 #include <stdio.h>
 123 #include <stdint.h>
 124 #include <stdlib.h>
 125 #include <stdarg.h>
 126 #include <stddef.h>
 127 #include <unistd.h>
 128 #include <string.h>
 129 #include <fcntl.h>
 130 #include <errno.h>
 131 #include <pthread.h>
 132 #include <limits.h>
 133
 134 #include <machine/atomic.h>
 135 #include <machine/cpufunc.h>
 136
 137 #ifdef STANDALONE_DEBUG
 138 void _nmalloc_thr_init(void);
 139 #else
 140 #include "spinlock.h"
 141 #include "un-namespace.h"
 142 #endif
 143
 144 #ifndef MAP_SIZEALIGN
 145 #define MAP_SIZEALIGN   0
 146 #endif
 147
 148 #if SSIZE_MAX == 0x7FFFFFFF
 149 #define ADDRBITS        32
 150 #define UVM_BITS        32      /* worst case */
 151 #else
 152 #define ADDRBITS        64
 153 #define UVM_BITS        48      /* worst case XXX */
 154 #endif
 155
 156 #if LONG_MAX == 0x7FFFFFFF
 157 #define LONG_BITS       32
 158 #define LONG_BITS_SHIFT 5
 159 #else
 160 #define LONG_BITS       64
 161 #define LONG_BITS_SHIFT 6
 162 #endif
 163
 164 #define LOCKEDPTR       ((void *)(intptr_t)-1)
 165
 166 /*
 167  * Regions[]
 168  */
 169 #define NREGIONS_BITS   10
 170 #define NREGIONS        (1 << NREGIONS_BITS)
 171 #define NREGIONS_MASK   (NREGIONS - 1)
 172 #define NREGIONS_SHIFT  (UVM_BITS - NREGIONS_BITS)
 173 #define NREGIONS_SIZE   (1LU << NREGIONS_SHIFT)
 174
 175 typedef struct region *region_t;
 176 typedef struct slglobaldata *slglobaldata_t;
 177 typedef struct slab *slab_t;
 178
 179 struct region {
 180         uintptr_t       mask;
 181         slab_t          slab;   /* conditional out of band slab */
 182 };
 183
 184 static struct region Regions[NREGIONS];
 185
 186 /*
 187  * Number of chunking zones available
 188  */
 189 #define CHUNKFACTOR     8
 190 #if ADDRBITS == 32
 191 #define NZONES          (16 + 9 * CHUNKFACTOR + 16 * CHUNKFACTOR)
 192 #else
 193 #define NZONES          (16 + 9 * CHUNKFACTOR + 48 * CHUNKFACTOR)
 194 #endif
 195
 196 static int MaxChunks[NZONES];
 197
 198 #define NDEPOTS         8               /* must be power of 2 */
 199
 200 /*
 201  * Maximum number of chunks per slab, governed by the allocation bitmap in
 202  * each slab.  The maximum is reduced for large chunk sizes.
 203  */
 204 #define MAXCHUNKS       (LONG_BITS * LONG_BITS)
 205 #define MAXCHUNKS_BITS  (LONG_BITS_SHIFT * LONG_BITS_SHIFT)
 206 #define LITSLABSIZE     (32 * 1024)
 207 #define NOMSLABSIZE     (2 * 1024 * 1024)
 208 #define BIGSLABSIZE     (128 * 1024 * 1024)
 209
 210 #define ZALLOC_SLAB_MAGIC       0x736c6162      /* magic sanity */
 211
 212 TAILQ_HEAD(slab_list, slab);
 213
 214 /*
 215  * A slab structure
 216  */
 217 struct slab {
 218         struct slab     *next;          /* slabs with available space */
 219         TAILQ_ENTRY(slab) entry;
 220         int32_t         magic;          /* magic number for sanity check */
 221         u_int           navail;         /* number of free elements available */
 222         u_int           nmax;
 223         u_int           free_bit;       /* free hint bitno */
 224         u_int           free_index;     /* free hint index */
 225         u_long          bitmap[LONG_BITS]; /* free chunks */
 226         size_t          slab_size;      /* size of entire slab */
 227         size_t          chunk_size;     /* chunk size for validation */
 228         int             zone_index;
 229         enum { UNKNOWN, AVAIL, EMPTY, FULL } state;
 230         int             flags;
 231         region_t        region;         /* related region */
 232         char            *chunks;        /* chunk base */
 233         slglobaldata_t  slgd;           /* localized to thread else NULL */
 234 };
 235
 236 /*
 237  * per-thread data + global depot
 238  *
 239  * NOTE: The magazine shortcut is only used for per-thread data.
 240  */
 241 #define NMAGSHORTCUT    16
 242
 243 struct slglobaldata {
 244         spinlock_t      lock;           /* only used by slglobaldepot */
 245         struct zoneinfo {
 246                 slab_t  avail_base;
 247                 slab_t  empty_base;
 248                 int     best_region;
 249                 int     mag_index;
 250                 int     avail_count;
 251                 int     empty_count;
 252                 void    *mag_shortcut[NMAGSHORTCUT];
 253         } zone[NZONES];
 254         struct slab_list full_zones;    /* via entry */
 255         int             masked;
 256         int             biggest_index;
 257         size_t          nslabs;
 258 };
 259
 260 #define SLAB_ZEROD              0x0001
 261
 262 /*
 263  * Misc constants.  Note that allocations that are exact multiples of
 264  * PAGE_SIZE, or exceed the zone limit, fall through to the kmem module.
 265  * IN_SAME_PAGE_MASK is used to sanity-check the per-page free lists.
 266  */
 267 #define MIN_CHUNK_SIZE          8               /* in bytes */
 268 #define MIN_CHUNK_MASK          (MIN_CHUNK_SIZE - 1)
 269
 270 #define SAFLAG_ZERO     0x00000001
 271
 272 /*
 273  * The WEIRD_ADDR is used as known text to copy into free objects to
 274  * try to create deterministic failure cases if the data is accessed after
 275  * free.
 276  *
 277  * WARNING: A limited number of spinlocks are available, BIGXSIZE should
 278  *          not be larger then 64.
 279  */
 280 #ifdef INVARIANTS
 281 #define WEIRD_ADDR      0xdeadc0de
 282 #endif
 283
 284 /*
 285  * Thread control
 286  */
 287
 288 #define MASSERT(exp)    do { if (__predict_false(!(exp)))       \
 289                                 _mpanic("assertion: %s in %s",  \
 290                                 #exp, __func__);                \
 291                             } while (0)
 292
 293 /*
 294  * With this attribute set, do not require a function call for accessing
 295  * this variable when the code is compiled -fPIC.
 296  *
 297  * Must be empty for libc_rtld (similar to __thread)
 298  */
 299 #if defined(__LIBC_RTLD)
 300 #define TLS_ATTRIBUTE
 301 #else
 302 #define TLS_ATTRIBUTE __attribute__ ((tls_model ("initial-exec")));
 303 #endif
 304
 305 static __thread struct slglobaldata slglobal TLS_ATTRIBUTE;
 306 static pthread_key_t thread_malloc_key;
 307 static pthread_once_t thread_malloc_once = PTHREAD_ONCE_INIT;
 308 static struct slglobaldata slglobaldepot;
 309
 310 static int opt_madvise = 0;
 311 static int opt_free = 0;
 312 static int opt_cache = 4;
 313 static int opt_utrace = 0;
 314 static int g_malloc_flags = 0;
 315 static int malloc_panic;
 316
 317 #ifdef INVARIANTS
 318 static const int32_t weirdary[16] = {
 319         WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
 320         WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
 321         WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
 322         WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR
 323 };
 324 #endif
 325
 326 static void *memalloc(size_t size, int flags);
 327 static void *memrealloc(void *ptr, size_t size);
 328 static void memfree(void *ptr, int);
 329 static int memalign(void **memptr, size_t alignment, size_t size);
 330 static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size);
 331 static void slabfree(slab_t slab);
 332 static void slabterm(slglobaldata_t slgd, slab_t slab);
 333 static void *_vmem_alloc(int ri, size_t slab_size);
 334 static void _vmem_free(void *ptr, size_t slab_size);
 335 static void _mpanic(const char *ctl, ...) __printflike(1, 2);
 336 #ifndef STANDALONE_DEBUG
 337 static void malloc_init(void) __constructor(101);
 338 #else
 339 static void malloc_init(void) __constructor(101);
 340 #endif
 341
 342
 343 struct nmalloc_utrace {
 344         void *p;
 345         size_t s;
 346         void *r;
 347 };
 348
 349 #define UTRACE(a, b, c)                                         \
 350         if (opt_utrace) {                                       \
 351                 struct nmalloc_utrace ut = {                    \
 352                         .p = (a),                               \
 353                         .s = (b),                               \
 354                         .r = (c)                                \
 355                 };                                              \
 356                 utrace(&ut, sizeof(ut));                        \
 357         }
 358
 359 #ifdef INVARIANTS
 360 /*
 361  * If enabled any memory allocated without M_ZERO is initialized to -1.
 362  */
 363 static int  use_malloc_pattern;
 364 #endif
 365
 366 static void
 367 malloc_init(void)
 368 {
 369         const char *p = NULL;
 370
 371         TAILQ_INIT(&slglobal.full_zones);
 372
 373         Regions[0].mask = -1; /* disallow activity in lowest region */
 374
 375         if (issetugid() == 0)
 376                 p = getenv("MALLOC_OPTIONS");
 377
 378         for (; p != NULL && *p != '\0'; p++) {
 379                 switch(*p) {
 380                 case 'u':
 381                         opt_utrace = 0;
 382                         break;
 383                 case 'U':
 384                         opt_utrace = 1;
 385                         break;
 386                 case 'h':
 387                         opt_madvise = 0;
 388                         break;
 389                 case 'H':
 390                         opt_madvise = 1;
 391                         break;
 392                 case 'c':
 393                         if (opt_cache > 0)
 394                                 --opt_cache;
 395                         break;
 396                 case 'C':
 397                         ++opt_cache;
 398                         break;
 399                 case 'f':
 400                         opt_free = 0;
 401                         break;
 402                 case 'F':
 403                         opt_free = 1;
 404                         break;
 405                 case 'z':
 406                         g_malloc_flags = 0;
 407                         break;
 408                 case 'Z':
 409                         g_malloc_flags = SAFLAG_ZERO;
 410                         break;
 411                 default:
 412                         break;
 413                 }
 414         }
 415
 416         UTRACE((void *) -1, 0, NULL);
 417 }
 418
 419 /*
 420  * We have to install a handler for nmalloc thread teardowns when
 421  * the thread is created.  We cannot delay this because destructors in
 422  * sophisticated userland programs can call malloc() for the first time
 423  * during their thread exit.
 424  *
 425  * This routine is called directly from pthreads.
 426  */
 427 static void _nmalloc_thr_init_once(void);
 428 static void _nmalloc_thr_destructor(void *thrp);
 429
 430 void
 431 _nmalloc_thr_init(void)
 432 {
 433         static int did_init;
 434
 435         TAILQ_INIT(&slglobal.full_zones);
 436
 437         if (slglobal.masked)
 438                 return;
 439
 440         slglobal.masked = 1;
 441         if (did_init == 0) {
 442                 did_init = 1;
 443                 pthread_once(&thread_malloc_once, _nmalloc_thr_init_once);
 444         }
 445         pthread_setspecific(thread_malloc_key, &slglobal);
 446         slglobal.masked = 0;
 447 }
 448
 449 void
 450 _nmalloc_thr_prepfork(void)
 451 {
 452         if (__isthreaded)
 453                 _SPINLOCK(&slglobaldepot.lock);
 454 }
 455
 456 void
 457 _nmalloc_thr_parentfork(void)
 458 {
 459         if (__isthreaded)
 460                 _SPINUNLOCK(&slglobaldepot.lock);
 461 }
 462
 463 void
 464 _nmalloc_thr_childfork(void)
 465 {
 466         if (__isthreaded)
 467                 _SPINUNLOCK(&slglobaldepot.lock);
 468 }
 469
 470 /*
 471  * Called just once
 472  */
 473 static void
 474 _nmalloc_thr_init_once(void)
 475 {
 476         int error;
 477
 478         error = pthread_key_create(&thread_malloc_key, _nmalloc_thr_destructor);
 479         if (error)
 480                 abort();
 481 }
 482
 483 /*
 484  * Called for each thread undergoing exit
 485  *
 486  * Move all of the thread's slabs into a depot.
 487  */
 488 static void
 489 _nmalloc_thr_destructor(void *thrp)
 490 {
 491         slglobaldata_t slgd = thrp;
 492         struct zoneinfo *zinfo;
 493         slab_t slab;
 494         void *ptr;
 495         int i;
 496         int j;
 497
 498         slgd->masked = 1;
 499
 500         for (i = 0; i <= slgd->biggest_index; i++) {
 501                 zinfo = &slgd->zone[i];
 502
 503                 while ((j = zinfo->mag_index) > 0) {
 504                         --j;
 505                         ptr = zinfo->mag_shortcut[j];
 506                         zinfo->mag_shortcut[j] = NULL;  /* SAFETY */
 507                         zinfo->mag_index = j;
 508                         memfree(ptr, 0);
 509                 }
 510
 511                 while ((slab = zinfo->empty_base) != NULL) {
 512                         zinfo->empty_base = slab->next;
 513                         --zinfo->empty_count;
 514                         slabterm(slgd, slab);
 515                 }
 516
 517                 while ((slab = zinfo->avail_base) != NULL) {
 518                         zinfo->avail_base = slab->next;
 519                         --zinfo->avail_count;
 520                         slabterm(slgd, slab);
 521                 }
 522
 523                 while ((slab = TAILQ_FIRST(&slgd->full_zones)) != NULL) {
 524                         TAILQ_REMOVE(&slgd->full_zones, slab, entry);
 525                         slabterm(slgd, slab);
 526                 }
 527         }
 528 }
 529
 530 /*
 531  * Calculate the zone index for the allocation request size and set the
 532  * allocation request size to that particular zone's chunk size.
 533  *
 534  * Minimum alignment is 16 bytes for allocations >= 16 bytes to conform
 535  * with malloc requirements for intel/amd.
 536  */
 537 static __inline int
 538 zoneindex(size_t *bytes, size_t *chunking)
 539 {
 540         size_t n = (size_t)*bytes;
 541         size_t x;
 542         size_t c;
 543         int i;
 544
 545         if (n < 128) {
 546                 if (n < 16) {
 547                         *bytes = n = (n + 7) & ~7;
 548                         *chunking = 8;
 549                         return(n / 8 - 1);      /* 8 byte chunks, 2 zones */
 550                 } else {
 551                         *bytes = n = (n + 15) & ~15;
 552                         *chunking = 16;
 553                         return(n / 16 + 2);     /* 16 byte chunks, 8 zones */
 554                 }
 555         }
 556         if (n < 4096) {
 557                 x = 256;
 558                 c = x / (CHUNKFACTOR * 2);
 559                 i = 16;
 560         } else {
 561                 x = 8192;
 562                 c = x / (CHUNKFACTOR * 2);
 563                 i = 16 + CHUNKFACTOR * 5;  /* 256->512,1024,2048,4096,8192 */
 564         }
 565         while (n >= x) {
 566                 x <<= 1;
 567                 c <<= 1;
 568                 i += CHUNKFACTOR;
 569                 if (x == 0)
 570                         _mpanic("slaballoc: byte value too high");
 571         }
 572         *bytes = n = roundup2(n, c);
 573         *chunking = c;
 574         return (i + n / c - CHUNKFACTOR);
 575 #if 0
 576         *bytes = n = (n + c - 1) & ~(c - 1);
 577         *chunking = c;
 578         return (n / c + i);
 579
 580         if (n < 256) {
 581                 *bytes = n = (n + 15) & ~15;
 582                 *chunking = 16;
 583                 return(n / (CHUNKINGLO*2) + CHUNKINGLO*1 - 1);
 584         }
 585         if (n < 8192) {
 586                 if (n < 512) {
 587                         *bytes = n = (n + 31) & ~31;
 588                         *chunking = 32;
 589                         return(n / (CHUNKINGLO*4) + CHUNKINGLO*2 - 1);
 590                 }
 591                 if (n < 1024) {
 592                         *bytes = n = (n + 63) & ~63;
 593                         *chunking = 64;
 594                         return(n / (CHUNKINGLO*8) + CHUNKINGLO*3 - 1);
 595                 }
 596                 if (n < 2048) {
 597                         *bytes = n = (n + 127) & ~127;
 598                         *chunking = 128;
 599                         return(n / (CHUNKINGLO*16) + CHUNKINGLO*4 - 1);
 600                 }
 601                 if (n < 4096) {
 602                         *bytes = n = (n + 255) & ~255;
 603                         *chunking = 256;
 604                         return(n / (CHUNKINGLO*32) + CHUNKINGLO*5 - 1);
 605                 }
 606                 *bytes = n = (n + 511) & ~511;
 607                 *chunking = 512;
 608                 return(n / (CHUNKINGLO*64) + CHUNKINGLO*6 - 1);
 609         }
 610         if (n < 16384) {
 611                 *bytes = n = (n + 1023) & ~1023;
 612                 *chunking = 1024;
 613                 return(n / (CHUNKINGLO*128) + CHUNKINGLO*7 - 1);
 614         }
 615         if (n < 32768) {                                /* 16384-32767 */
 616                 *bytes = n = (n + 2047) & ~2047;
 617                 *chunking = 2048;
 618                 return(n / (CHUNKINGLO*256) + CHUNKINGLO*8 - 1);
 619         }
 620         if (n < 65536) {
 621                 *bytes = n = (n + 4095) & ~4095;        /* 32768-65535 */
 622                 *chunking = 4096;
 623                 return(n / (CHUNKINGLO*512) + CHUNKINGLO*9 - 1);
 624         }
 625
 626         x = 131072;
 627         c = 8192;
 628         i = CHUNKINGLO*10 - 1;
 629
 630         while (n >= x) {
 631                 x <<= 1;
 632                 c <<= 1;
 633                 i += CHUNKINGHI;
 634                 if (x == 0)
 635                         _mpanic("slaballoc: byte value too high");
 636         }
 637         *bytes = n = (n + c - 1) & ~(c - 1);
 638         *chunking = c;
 639         return (n / c + i);
 640 #endif
 641 }
 642
 643 /*
 644  * malloc() - call internal slab allocator
 645  */
 646 void *
 647 __malloc(size_t size)
 648 {
 649         void *ptr;
 650
 651         ptr = memalloc(size, 0);
 652         if (ptr == NULL)
 653                 errno = ENOMEM;
 654         else
 655                 UTRACE(0, size, ptr);
 656         return(ptr);
 657 }
 658
 659 /*
 660  * calloc() - call internal slab allocator
 661  */
 662 void *
 663 __calloc(size_t number, size_t size)
 664 {
 665         void *ptr;
 666
 667         ptr = memalloc(number * size, SAFLAG_ZERO);
 668         if (ptr == NULL)
 669                 errno = ENOMEM;
 670         else
 671                 UTRACE(0, number * size, ptr);
 672         return(ptr);
 673 }
 674
 675 /*
 676  * realloc() (SLAB ALLOCATOR)
 677  *
 678  * We do not attempt to optimize this routine beyond reusing the same
 679  * pointer if the new size fits within the chunking of the old pointer's
 680  * zone.
 681  */
 682 void *
 683 __realloc(void *ptr, size_t size)
 684 {
 685         void *ret;
 686
 687         if (ptr == NULL)
 688                 ret = memalloc(size, 0);
 689         else
 690                 ret = memrealloc(ptr, size);
 691         if (ret == NULL)
 692                 errno = ENOMEM;
 693         else
 694                 UTRACE(ptr, size, ret);
 695         return(ret);
 696 }
 697
 698 /*
 699  * aligned_alloc()
 700  *
 701  * Allocate (size) bytes with a alignment of (alignment).
 702  */
 703 void *
 704 __aligned_alloc(size_t alignment, size_t size)
 705 {
 706         void *ptr;
 707         int rc;
 708
 709         ptr = NULL;
 710         rc = memalign(&ptr, alignment, size);
 711         if (rc)
 712                 errno = rc;
 713
 714         return (ptr);
 715 }
 716
 717 /*
 718  * posix_memalign()
 719  *
 720  * Allocate (size) bytes with a alignment of (alignment), where (alignment)
 721  * is a power of 2 >= sizeof(void *).
 722  */
 723 int
 724 __posix_memalign(void **memptr, size_t alignment, size_t size)
 725 {
 726         int rc;
 727
 728         /*
 729          * OpenGroup spec issue 6 check
 730          */
 731         if (alignment < sizeof(void *)) {
 732                 *memptr = NULL;
 733                 return(EINVAL);
 734         }
 735
 736         rc = memalign(memptr, alignment, size);
 737
 738         return (rc);
 739 }
 740
 741 /*
 742  * The slab allocator will allocate on power-of-2 boundaries up to at least
 743  * PAGE_SIZE.  Otherwise we use the zoneindex mechanic to find a zone
 744  * matching the requirements.
 745  */
 746 static int
 747 memalign(void **memptr, size_t alignment, size_t size)
 748 {
 749
 750         if (alignment < 1) {
 751                 *memptr = NULL;
 752                 return(EINVAL);
 753         }
 754
 755         /*
 756          * OpenGroup spec issue 6 check
 757          */
 758         if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) {
 759                 *memptr = NULL;
 760                 return(EINVAL);
 761         }
 762
 763         /*
 764          * XXX for now just find the nearest power of 2 >= size and also
 765          * >= alignment and allocate that.
 766          */
 767         while (alignment < size) {
 768                 alignment <<= 1;
 769                 if (alignment == 0)
 770                         _mpanic("posix_memalign: byte value too high");
 771         }
 772         *memptr = memalloc(alignment, 0);
 773         return(*memptr ? 0 : ENOMEM);
 774 }
 775
 776 /*
 777  * free() (SLAB ALLOCATOR) - do the obvious
 778  */
 779 void
 780 __free(void *ptr)
 781 {
 782         if (ptr) {
 783                 UTRACE(ptr, 0, 0);
 784                 memfree(ptr, 0);
 785         }
 786 }
 787
 788 /*
 789  * memalloc()   (SLAB ALLOCATOR)
 790  *
 791  *      Allocate memory via the slab allocator.
 792  */
 793 static void *
 794 memalloc(size_t size, int flags)
 795 {
 796         slglobaldata_t slgd;
 797         struct zoneinfo *zinfo;
 798         slab_t slab;
 799         size_t chunking;
 800         int bmi;
 801         int bno;
 802         u_long *bmp;
 803         int zi;
 804 #ifdef INVARIANTS
 805         int i;
 806 #endif
 807         int j;
 808         char *obj;
 809
 810         /*
 811          * If 0 bytes is requested we have to return a unique pointer, allocate
 812          * at least one byte.
 813          */
 814         if (size == 0)
 815                 size = 1;
 816
 817         /* Capture global flags */
 818         flags |= g_malloc_flags;
 819
 820         /* Compute allocation zone; zoneindex will panic on excessive sizes */
 821         zi = zoneindex(&size, &chunking);
 822         MASSERT(zi < NZONES);
 823         if (size == 0)
 824                 return(NULL);
 825
 826         /*
 827          * Try magazine shortcut first
 828          */
 829         slgd = &slglobal;
 830         zinfo = &slgd->zone[zi];
 831
 832         if ((j = zinfo->mag_index) != 0) {
 833                 zinfo->mag_index = --j;
 834                 obj = zinfo->mag_shortcut[j];
 835                 zinfo->mag_shortcut[j] = NULL;  /* SAFETY */
 836                 if (flags & SAFLAG_ZERO)
 837                         bzero(obj, size);
 838                 return obj;
 839         }
 840
 841         /*
 842          * Locate a slab with available space.  If no slabs are available
 843          * back-off to the empty list and if we still come up dry allocate
 844          * a new slab (which will try the depot first).
 845          */
 846 retry:
 847         if ((slab = zinfo->avail_base) == NULL) {
 848                 if ((slab = zinfo->empty_base) == NULL) {
 849                         /*
 850                          * Still dry
 851                          */
 852                         slab = slaballoc(zi, chunking, size);
 853                         if (slab == NULL)
 854                                 return(NULL);
 855                         slab->next = zinfo->avail_base;
 856                         zinfo->avail_base = slab;
 857                         ++zinfo->avail_count;
 858                         slab->state = AVAIL;
 859                         if (slgd->biggest_index < zi)
 860                                 slgd->biggest_index = zi;
 861                         ++slgd->nslabs;
 862                 } else {
 863                         /*
 864                          * Pulled from empty list
 865                          */
 866                         zinfo->empty_base = slab->next;
 867                         slab->next = zinfo->avail_base;
 868                         zinfo->avail_base = slab;
 869                         ++zinfo->avail_count;
 870                         slab->state = AVAIL;
 871                         --zinfo->empty_count;
 872                 }
 873         }
 874
 875         /*
 876          * Allocate a chunk out of the slab.  HOT PATH
 877          *
 878          * Only the thread owning the slab can allocate out of it.
 879          *
 880          * NOTE: The last bit in the bitmap is always marked allocated so
 881          *       we cannot overflow here.
 882          */
 883         bno = slab->free_bit;
 884         bmi = slab->free_index;
 885         bmp = &slab->bitmap[bmi];
 886         if (*bmp & (1LU << bno)) {
 887                 atomic_clear_long(bmp, 1LU << bno);
 888                 obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size;
 889                 slab->free_bit = (bno + 1) & (LONG_BITS - 1);
 890                 atomic_add_int(&slab->navail, -1);
 891                 if (flags & SAFLAG_ZERO)
 892                         bzero(obj, size);
 893                 return (obj);
 894         }
 895
 896         /*
 897          * Allocate a chunk out of a slab.  COLD PATH
 898          */
 899         if (slab->navail == 0) {
 900                 zinfo->avail_base = slab->next;
 901                 --zinfo->avail_count;
 902                 slab->state = FULL;
 903                 TAILQ_INSERT_TAIL(&slgd->full_zones, slab, entry);
 904                 goto retry;
 905         }
 906
 907         while (bmi < LONG_BITS) {
 908                 bmp = &slab->bitmap[bmi];
 909                 if (*bmp) {
 910                         bno = bsflong(*bmp);
 911                         atomic_clear_long(bmp, 1LU << bno);
 912                         obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
 913                                              size;
 914                         slab->free_index = bmi;
 915                         slab->free_bit = (bno + 1) & (LONG_BITS - 1);
 916                         atomic_add_int(&slab->navail, -1);
 917                         if (flags & SAFLAG_ZERO)
 918                                 bzero(obj, size);
 919                         return (obj);
 920                 }
 921                 ++bmi;
 922         }
 923         bmi = 0;
 924         while (bmi < LONG_BITS) {
 925                 bmp = &slab->bitmap[bmi];
 926                 if (*bmp) {
 927                         bno = bsflong(*bmp);
 928                         atomic_clear_long(bmp, 1LU << bno);
 929                         obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
 930                                              size;
 931                         slab->free_index = bmi;
 932                         slab->free_bit = (bno + 1) & (LONG_BITS - 1);
 933                         atomic_add_int(&slab->navail, -1);
 934                         if (flags & SAFLAG_ZERO)
 935                                 bzero(obj, size);
 936                         return (obj);
 937                 }
 938                 ++bmi;
 939         }
 940         _mpanic("slaballoc: corrupted zone: navail %d", slab->navail);
 941         /* not reached */
 942         return NULL;
 943 }
 944
 945 /*
 946  * Reallocate memory within the chunk
 947  */
 948 static void *
 949 memrealloc(void *ptr, size_t nsize)
 950 {
 951         region_t region;
 952         slab_t slab;
 953         size_t osize;
 954         char *obj;
 955         int flags = 0;
 956
 957         /*
 958          * If 0 bytes is requested we have to return a unique pointer, allocate
 959          * at least one byte.
 960          */
 961         if (nsize == 0)
 962                 nsize = 1;
 963
 964         /* Capture global flags */
 965         flags |= g_malloc_flags;
 966
 967         /*
 968          * Locate the zone by looking up the dynamic slab size mask based
 969          * on the memory region the allocation resides in.
 970          */
 971         region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
 972         if ((slab = region->slab) == NULL)
 973                 slab = (void *)((uintptr_t)ptr & region->mask);
 974         MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
 975         osize = slab->chunk_size;
 976         if (nsize <= osize) {
 977                 if (osize < 32 || nsize >= osize / 2) {
 978                         obj = ptr;
 979                         if ((flags & SAFLAG_ZERO) && nsize < osize)
 980                                 bzero(obj + nsize, osize - nsize);
 981                         return(obj);
 982                 }
 983         }
 984
 985         /*
 986          * Otherwise resize the object
 987          */
 988         obj = memalloc(nsize, 0);
 989         if (obj) {
 990                 if (nsize > osize)
 991                         nsize = osize;
 992                 bcopy(ptr, obj, nsize);
 993                 memfree(ptr, 0);
 994         }
 995         return (obj);
 996 }
 997
 998 /*
 999  * free (SLAB ALLOCATOR)
1000  *
1001  * Free a memory block previously allocated by malloc.
1002  *
1003  * MPSAFE
1004  */
1005 static void
1006 memfree(void *ptr, int flags)
1007 {
1008         region_t region;
1009         slglobaldata_t slgd;
1010         slab_t slab;
1011         slab_t stmp;
1012         slab_t *slabp;
1013         int bmi;
1014         int bno;
1015         int j;
1016         u_long *bmp;
1017
1018         /*
1019          * Locate the zone by looking up the dynamic slab size mask based
1020          * on the memory region the allocation resides in.
1021          *
1022          * WARNING!  The slab may be owned by another thread!
1023          */
1024         region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
1025         if ((slab = region->slab) == NULL)
1026                 slab = (void *)((uintptr_t)ptr & region->mask);
1027         MASSERT(slab != NULL);
1028         MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
1029
1030 #ifdef INVARIANTS
1031         /*
1032          * Put weird data into the memory to detect modifications after
1033          * freeing, illegal pointer use after freeing (we should fault on
1034          * the odd address), and so forth.
1035          */
1036         if (slab->chunk_size < sizeof(weirdary))
1037                 bcopy(weirdary, ptr, slab->chunk_size);
1038         else
1039                 bcopy(weirdary, ptr, sizeof(weirdary));
1040 #endif
1041         slgd = &slglobal;
1042
1043         /*
1044          * Use mag_shortcut[] when possible
1045          */
1046         if (slgd->masked == 0 && slab->chunk_size <= NOMSLABSIZE) {
1047                 struct zoneinfo *zinfo;
1048
1049                 zinfo = &slgd->zone[slab->zone_index];
1050                 j = zinfo->mag_index;
1051                 if (j < NMAGSHORTCUT) {
1052                         zinfo->mag_shortcut[j] = ptr;
1053                         zinfo->mag_index = j + 1;
1054                         return;
1055                 }
1056         }
1057
1058         /*
1059          * Free to slab and increment navail.  We can delay incrementing
1060          * navail to prevent the slab from being destroyed out from under
1061          * us while we do other optimizations.
1062          */
1063         bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size;
1064         bmi = bno >> LONG_BITS_SHIFT;
1065         bno &= (LONG_BITS - 1);
1066         bmp = &slab->bitmap[bmi];
1067
1068         MASSERT(bmi >= 0 && bmi < slab->nmax);
1069         MASSERT((*bmp & (1LU << bno)) == 0);
1070         atomic_set_long(bmp, 1LU << bno);
1071
1072         if (slab->slgd == slgd) {
1073                 /*
1074                  * We can only do the following if we own the slab.  Note
1075                  * that navail can be incremented by any thread even if
1076                  * we own the slab.
1077                  */
1078                 struct zoneinfo *zinfo;
1079
1080                 atomic_add_int(&slab->navail, 1);
1081                 if (slab->free_index > bmi) {
1082                         slab->free_index = bmi;
1083                         slab->free_bit = bno;
1084                 } else if (slab->free_index == bmi &&
1085                            slab->free_bit > bno) {
1086                         slab->free_bit = bno;
1087                 }
1088                 zinfo = &slgd->zone[slab->zone_index];
1089
1090                 /*
1091                  * Freeing an object from a full slab makes it less than
1092                  * full.  The slab must be moved to the available list.
1093                  *
1094                  * If the available list has too many slabs, release some
1095                  * to the depot.
1096                  */
1097                 if (slab->state == FULL) {
1098                         TAILQ_REMOVE(&slgd->full_zones, slab, entry);
1099                         slab->state = AVAIL;
1100                         stmp = zinfo->avail_base;
1101                         slab->next = stmp;
1102                         zinfo->avail_base = slab;
1103                         ++zinfo->avail_count;
1104                         while (zinfo->avail_count > opt_cache) {
1105                                 slab = zinfo->avail_base;
1106                                 zinfo->avail_base = slab->next;
1107                                 --zinfo->avail_count;
1108                                 slabterm(slgd, slab);
1109                         }
1110                         goto done;
1111                 }
1112
1113                 /*
1114                  * If the slab becomes completely empty dispose of it in
1115                  * some manner.  By default each thread caches up to 4
1116                  * empty slabs.  Only small slabs are cached.
1117                  */
1118                 if (slab->navail == slab->nmax && slab->state == AVAIL) {
1119                         /*
1120                          * Remove slab from available queue
1121                          */
1122                         slabp = &zinfo->avail_base;
1123                         while ((stmp = *slabp) != slab)
1124                                 slabp = &stmp->next;
1125                         *slabp = slab->next;
1126                         --zinfo->avail_count;
1127
1128                         if (opt_free || opt_cache == 0) {
1129                                 /*
1130                                  * If local caching is disabled cache the
1131                                  * slab in the depot (or free it).
1132                                  */
1133                                 slabterm(slgd, slab);
1134                         } else if (slab->slab_size > BIGSLABSIZE) {
1135                                 /*
1136                                  * We do not try to retain large slabs
1137                                  * in per-thread caches.
1138                                  */
1139                                 slabterm(slgd, slab);
1140                         } else if (zinfo->empty_count > opt_cache) {
1141                                 /*
1142                                  * We have too many slabs cached, but
1143                                  * instead of freeing this one free
1144                                  * an empty slab that's been idle longer.
1145                                  *
1146                                  * (empty_count does not change)
1147                                  */
1148                                 stmp = zinfo->empty_base;
1149                                 slab->state = EMPTY;
1150                                 slab->next = stmp->next;
1151                                 zinfo->empty_base = slab;
1152                                 slabterm(slgd, stmp);
1153                         } else {
1154                                 /*
1155                                  * Cache the empty slab in our thread local
1156                                  * empty list.
1157                                  */
1158                                 ++zinfo->empty_count;
1159                                 slab->state = EMPTY;
1160                                 slab->next = zinfo->empty_base;
1161                                 zinfo->empty_base = slab;
1162                         }
1163                 }
1164         } else if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1165                 slglobaldata_t sldepot;
1166
1167                 /*
1168                  * If freeing to a slab owned by the global depot, and
1169                  * the slab becomes completely EMPTY, try to move it to
1170                  * the correct list.
1171                  */
1172                 sldepot = &slglobaldepot;
1173                 if (__isthreaded)
1174                         _SPINLOCK(&sldepot->lock);
1175                 if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1176                         struct zoneinfo *zinfo;
1177
1178                         /*
1179                          * Move the slab to the empty list
1180                          */
1181                         MASSERT(slab->state == AVAIL);
1182                         atomic_add_int(&slab->navail, 1);
1183                         zinfo = &sldepot->zone[slab->zone_index];
1184                         slabp = &zinfo->avail_base;
1185                         while (slab != *slabp)
1186                                 slabp = &(*slabp)->next;
1187                         *slabp = slab->next;
1188                         --zinfo->avail_count;
1189
1190                         /*
1191                          * Clean out excessive empty entries from the
1192                          * depot.
1193                          */
1194                         slab->state = EMPTY;
1195                         slab->next = zinfo->empty_base;
1196                         zinfo->empty_base = slab;
1197                         ++zinfo->empty_count;
1198                         while (zinfo->empty_count > opt_cache) {
1199                                 slab = zinfo->empty_base;
1200                                 zinfo->empty_base = slab->next;
1201                                 --zinfo->empty_count;
1202                                 slab->state = UNKNOWN;
1203                                 if (__isthreaded)
1204                                         _SPINUNLOCK(&sldepot->lock);
1205                                 slabfree(slab);
1206                                 if (__isthreaded)
1207                                         _SPINLOCK(&sldepot->lock);
1208                         }
1209                 } else {
1210                         atomic_add_int(&slab->navail, 1);
1211                 }
1212                 if (__isthreaded)
1213                         _SPINUNLOCK(&sldepot->lock);
1214         } else {
1215                 /*
1216                  * We can't act on the slab other than by adjusting navail
1217                  * (and the bitmap which we did in the common code at the
1218                  * top).
1219                  */
1220                 atomic_add_int(&slab->navail, 1);
1221         }
1222 done:
1223         ;
1224 }
1225
1226 /*
1227  * Allocate a new slab holding objects of size chunk_size.
1228  */
1229 static slab_t
1230 slaballoc(int zi, size_t chunking, size_t chunk_size)
1231 {
1232         slglobaldata_t slgd;
1233         slglobaldata_t sldepot;
1234         struct zoneinfo *zinfo;
1235         region_t region;
1236         void *save;
1237         slab_t slab;
1238         size_t slab_desire;
1239         size_t slab_size;
1240         size_t region_mask;
1241         uintptr_t chunk_offset;
1242         ssize_t maxchunks;
1243         ssize_t tmpchunks;
1244         int ispower2;
1245         int power;
1246         int ri;
1247         int rx;
1248         int nswath;
1249         int j;
1250
1251         /*
1252          * First look in the depot.  Any given zone in the depot may be
1253          * locked by being set to -1.  We have to do this instead of simply
1254          * removing the entire chain because removing the entire chain can
1255          * cause racing threads to allocate local slabs for large objects,
1256          * resulting in a large VSZ.
1257          */
1258         slgd = &slglobal;
1259         sldepot = &slglobaldepot;
1260         zinfo = &sldepot->zone[zi];
1261
1262         if (zinfo->avail_base) {
1263                 if (__isthreaded)
1264                         _SPINLOCK(&sldepot->lock);
1265                 slab = zinfo->avail_base;
1266                 if (slab) {
1267                         MASSERT(slab->slgd == NULL);
1268                         slab->slgd = slgd;
1269                         zinfo->avail_base = slab->next;
1270                         --zinfo->avail_count;
1271                         if (__isthreaded)
1272                                 _SPINUNLOCK(&sldepot->lock);
1273                         return slab;
1274                 }
1275                 if (__isthreaded)
1276                         _SPINUNLOCK(&sldepot->lock);
1277         }
1278
1279         /*
1280          * Nothing in the depot, allocate a new slab by locating or assigning
1281          * a region and then using the system virtual memory allocator.
1282          */
1283         slab = NULL;
1284
1285         /*
1286          * Calculate the start of the data chunks relative to the start
1287          * of the slab.  If chunk_size is a power of 2 we guarantee
1288          * power of 2 alignment.  If it is not we guarantee alignment
1289          * to the chunk size.
1290          */
1291         if ((chunk_size ^ (chunk_size - 1)) == (chunk_size << 1) - 1) {
1292                 ispower2 = 1;
1293                 chunk_offset = roundup2(sizeof(*slab), chunk_size);
1294         } else {
1295                 ispower2 = 0;
1296                 chunk_offset = sizeof(*slab) + chunking - 1;
1297                 chunk_offset -= chunk_offset % chunking;
1298         }
1299
1300         /*
1301          * Calculate a reasonable number of chunks for the slab.
1302          *
1303          * Once initialized the MaxChunks[] array can only ever be
1304          * reinitialized to the same value.
1305          */
1306         maxchunks = MaxChunks[zi];
1307         if (maxchunks == 0) {
1308                 /*
1309                  * First calculate how many chunks would fit in 1/1024
1310                  * available memory.  This is around 2MB on a 32 bit
1311                  * system and 128G on a 64-bit (48-bits available) system.
1312                  */
1313                 maxchunks = (ssize_t)(NREGIONS_SIZE - chunk_offset) /
1314                             (ssize_t)chunk_size;
1315                 if (maxchunks <= 0)
1316                         maxchunks = 1;
1317
1318                 /*
1319                  * A slab cannot handle more than MAXCHUNKS chunks, but
1320                  * limit us to approximately MAXCHUNKS / 2 here because
1321                  * we may have to expand maxchunks when we calculate the
1322                  * actual power-of-2 slab.
1323                  */
1324                 if (maxchunks > MAXCHUNKS / 2)
1325                         maxchunks = MAXCHUNKS / 2;
1326
1327                 /*
1328                  * Try to limit the slabs to BIGSLABSIZE (~128MB).  Larger
1329                  * slabs will be created if the allocation does not fit.
1330                  */
1331                 if (chunk_offset + chunk_size * maxchunks > BIGSLABSIZE) {
1332                         tmpchunks = (ssize_t)(BIGSLABSIZE - chunk_offset) /
1333                                     (ssize_t)chunk_size;
1334                         if (tmpchunks <= 0)
1335                                 tmpchunks = 1;
1336                         if (maxchunks > tmpchunks)
1337                                 maxchunks = tmpchunks;
1338                 }
1339
1340                 /*
1341                  * If the slab calculates to greater than 2MB see if we
1342                  * can cut it down to ~2MB.  This controls VSZ but has
1343                  * no effect on run-time size or performance.
1344                  *
1345                  * This is very important in case you core dump and also
1346                  * important to reduce unnecessary region allocations.
1347                  */
1348                 if (chunk_offset + chunk_size * maxchunks > NOMSLABSIZE) {
1349                         tmpchunks = (ssize_t)(NOMSLABSIZE - chunk_offset) /
1350                                     (ssize_t)chunk_size;
1351                         if (tmpchunks < 1)
1352                                 tmpchunks = 1;
1353                         if (maxchunks > tmpchunks)
1354                                 maxchunks = tmpchunks;
1355                 }
1356
1357                 /*
1358                  * If the slab calculates to greater than 128K see if we
1359                  * can cut it down to ~128K while still maintaining a
1360                  * reasonably large number of chunks in each slab.  This
1361                  * controls VSZ but has no effect on run-time size or
1362                  * performance.
1363                  *
1364                  * This is very important in case you core dump and also
1365                  * important to reduce unnecessary region allocations.
1366                  */
1367                 if (chunk_offset + chunk_size * maxchunks > LITSLABSIZE) {
1368                         tmpchunks = (ssize_t)(LITSLABSIZE - chunk_offset) /
1369                                     (ssize_t)chunk_size;
1370                         if (tmpchunks < 32)
1371                                 tmpchunks = 32;
1372                         if (maxchunks > tmpchunks)
1373                                 maxchunks = tmpchunks;
1374                 }
1375
1376                 MaxChunks[zi] = maxchunks;
1377         }
1378         MASSERT(maxchunks > 0 && maxchunks <= MAXCHUNKS);
1379
1380         /*
1381          * Calculate the actual slab size.  maxchunks will be recalculated
1382          * a little later.
1383          */
1384         slab_desire = chunk_offset + chunk_size * maxchunks;
1385         slab_size = 8 * MAXCHUNKS;
1386         power = 3 + MAXCHUNKS_BITS;
1387         while (slab_size < slab_desire) {
1388                 slab_size <<= 1;
1389                 ++power;
1390         }
1391
1392         /*
1393          * Do a quick recalculation based on the actual slab size but not
1394          * yet dealing with whether the slab header is in-band or out-of-band.
1395          * The purpose here is to see if we can reasonably reduce slab_size
1396          * to a power of 4 to allow more chunk sizes to use the same slab
1397          * size.
1398          */
1399         if ((power & 1) && slab_size > 32768) {
1400                 maxchunks = (slab_size - chunk_offset) / chunk_size;
1401                 if (maxchunks >= MAXCHUNKS / 8) {
1402                         slab_size >>= 1;
1403                         --power;
1404                 }
1405         }
1406         if ((power & 2) && slab_size > 32768 * 4) {
1407                 maxchunks = (slab_size - chunk_offset) / chunk_size;
1408                 if (maxchunks >= MAXCHUNKS / 4) {
1409                         slab_size >>= 2;
1410                         power -= 2;
1411                 }
1412         }
1413         /*
1414          * This case occurs when the slab_size is larger than 1/1024 available
1415          * UVM.
1416          */
1417         nswath = slab_size / NREGIONS_SIZE;
1418         if (nswath > NREGIONS)
1419                 return (NULL);
1420
1421
1422         /*
1423          * Try to allocate from our current best region for this zi
1424          */
1425         region_mask = ~(slab_size - 1);
1426         ri = slgd->zone[zi].best_region;
1427         if (Regions[ri].mask == region_mask) {
1428                 if ((slab = _vmem_alloc(ri, slab_size)) != NULL)
1429                         goto found;
1430         }
1431
1432         /*
1433          * Try to find an existing region to allocate from.  The normal
1434          * case will be for allocations that are less than 1/1024 available
1435          * UVM, which fit into a single Regions[] entry.
1436          */
1437         while (slab_size <= NREGIONS_SIZE) {
1438                 rx = -1;
1439                 for (ri = 0; ri < NREGIONS; ++ri) {
1440                         if (rx < 0 && Regions[ri].mask == 0)
1441                                 rx = ri;
1442                         if (Regions[ri].mask == region_mask) {
1443                                 slab = _vmem_alloc(ri, slab_size);
1444                                 if (slab) {
1445                                         slgd->zone[zi].best_region = ri;
1446                                         goto found;
1447                                 }
1448                         }
1449                 }
1450
1451                 if (rx < 0)
1452                         return(NULL);
1453
1454                 /*
1455                  * This can fail, retry either way
1456                  */
1457                 atomic_cmpset_ptr((void **)&Regions[rx].mask,
1458                                   NULL,
1459                                   (void *)region_mask);
1460         }
1461
1462         for (;;) {
1463                 rx = -1;
1464                 for (ri = 0; ri < NREGIONS; ri += nswath) {
1465                         if (Regions[ri].mask == region_mask) {
1466                                 slab = _vmem_alloc(ri, slab_size);
1467                                 if (slab) {
1468                                         slgd->zone[zi].best_region = ri;
1469                                         goto found;
1470                                 }
1471                         }
1472                         if (rx < 0) {
1473                                 for (j = nswath - 1; j >= 0; --j) {
1474                                         if (Regions[ri+j].mask != 0)
1475                                                 break;
1476                                 }
1477                                 if (j < 0)
1478                                         rx = ri;
1479                         }
1480                 }
1481
1482                 /*
1483                  * We found a candidate, try to allocate it backwards so
1484                  * another thread racing a slaballoc() does not see the
1485                  * mask in the base index position until we are done.
1486                  *
1487                  * We can safely zero-out any partial allocations because
1488                  * the mask is only accessed from the base index.  Any other
1489                  * threads racing us will fail prior to us clearing the mask.
1490                  */
1491                 if (rx < 0)
1492                         return(NULL);
1493                 for (j = nswath - 1; j >= 0; --j) {
1494                         if (!atomic_cmpset_ptr((void **)&Regions[rx+j].mask,
1495                                                NULL, (void *)region_mask)) {
1496                                 while (++j < nswath)
1497                                         Regions[rx+j].mask = 0;
1498                                 break;
1499                         }
1500                 }
1501                 /* retry */
1502         }
1503
1504         /*
1505          * Fill in the new slab in region ri.  If the slab_size completely
1506          * fills one or more region slots we move the slab structure out of
1507          * band which should optimize the chunking (particularly for a power
1508          * of 2).
1509          */
1510 found:
1511         region = &Regions[ri];
1512         MASSERT(region->slab == NULL);
1513         if (slab_size >= NREGIONS_SIZE) {
1514                 save = slab;
1515                 slab = memalloc(sizeof(*slab), 0);
1516                 bzero(slab, sizeof(*slab));
1517                 slab->chunks = save;
1518                 for (j = 0; j < nswath; ++j)
1519                         region[j].slab = slab;
1520                 chunk_offset = 0;
1521         } else {
1522                 bzero(slab, sizeof(*slab));
1523                 slab->chunks = (char *)slab + chunk_offset;
1524         }
1525
1526         /*
1527          * Calculate the start of the chunks memory and recalculate the
1528          * actual number of chunks the slab can hold.
1529          */
1530         maxchunks = (slab_size - chunk_offset) / chunk_size;
1531         if (maxchunks > MAXCHUNKS)
1532                 maxchunks = MAXCHUNKS;
1533
1534         /*
1535          * And fill in the rest
1536          */
1537         slab->magic = ZALLOC_SLAB_MAGIC;
1538         slab->navail = maxchunks;
1539         slab->nmax = maxchunks;
1540         slab->slab_size = slab_size;
1541         slab->chunk_size = chunk_size;
1542         slab->zone_index = zi;
1543         slab->slgd = slgd;
1544         slab->state = UNKNOWN;
1545         slab->region = region;
1546
1547         for (ri = 0; ri < maxchunks; ri += LONG_BITS) {
1548                 if (ri + LONG_BITS <= maxchunks)
1549                         slab->bitmap[ri >> LONG_BITS_SHIFT] = ULONG_MAX;
1550                 else
1551                         slab->bitmap[ri >> LONG_BITS_SHIFT] =
1552                                                 (1LU << (maxchunks - ri)) - 1;
1553         }
1554         return (slab);
1555 }
1556
1557 /*
1558  * Free a slab.
1559  */
1560 static void
1561 slabfree(slab_t slab)
1562 {
1563         int nswath;
1564         int j;
1565
1566         if (slab->region->slab == slab) {
1567                 /*
1568                  * Out-of-band slab.
1569                  */
1570                 nswath = slab->slab_size / NREGIONS_SIZE;
1571                 for (j = 0; j < nswath; ++j)
1572                         slab->region[j].slab = NULL;
1573                 slab->magic = 0;
1574                 _vmem_free(slab->chunks, slab->slab_size);
1575                 memfree(slab, 0);
1576         } else {
1577                 /*
1578                  * In-band slab.
1579                  */
1580                 slab->magic = 0;
1581                 _vmem_free(slab, slab->slab_size);
1582         }
1583 }
1584
1585 /*
1586  * Terminate a slab's use in the current thread.  The slab may still have
1587  * outstanding allocations and thus not be deallocatable.
1588  */
1589 static void
1590 slabterm(slglobaldata_t slgd, slab_t slab)
1591 {
1592         slglobaldata_t sldepot;
1593         struct zoneinfo *zinfo;
1594         int zi = slab->zone_index;
1595
1596         slab->slgd = NULL;
1597         --slgd->nslabs;
1598         sldepot = &slglobaldepot;
1599         zinfo = &sldepot->zone[zi];
1600
1601         /*
1602          * Move the slab to the avail list or the empty list.
1603          */
1604         if (__isthreaded)
1605                 _SPINLOCK(&sldepot->lock);
1606         if (slab->navail == slab->nmax) {
1607                 slab->state = EMPTY;
1608                 slab->next = zinfo->empty_base;
1609                 zinfo->empty_base = slab;
1610                 ++zinfo->empty_count;
1611         } else {
1612                 slab->state = AVAIL;
1613                 slab->next = zinfo->avail_base;
1614                 zinfo->avail_base = slab;
1615                 ++zinfo->avail_count;
1616         }
1617
1618         /*
1619          * Clean extra slabs out of the empty list
1620          */
1621         while (zinfo->empty_count > opt_cache) {
1622                 slab = zinfo->empty_base;
1623                 zinfo->empty_base = slab->next;
1624                 --zinfo->empty_count;
1625                 slab->state = UNKNOWN;
1626                 if (__isthreaded)
1627                         _SPINUNLOCK(&sldepot->lock);
1628                 slabfree(slab);
1629                 if (__isthreaded)
1630                         _SPINLOCK(&sldepot->lock);
1631         }
1632         if (__isthreaded)
1633                 _SPINUNLOCK(&sldepot->lock);
1634 }
1635
1636 /*
1637  * _vmem_alloc()
1638  *
1639  *      Directly map memory in PAGE_SIZE'd chunks with the specified
1640  *      alignment.
1641  *
1642  *      Alignment must be a multiple of PAGE_SIZE.
1643  *
1644  *      Size must be >= alignment.
1645  */
1646 static void *
1647 _vmem_alloc(int ri, size_t slab_size)
1648 {
1649         char *baddr = (void *)((uintptr_t)ri << NREGIONS_SHIFT);
1650         char *eaddr;
1651         char *addr;
1652         char *save;
1653         uintptr_t excess;
1654
1655         if (slab_size < NREGIONS_SIZE)
1656                 eaddr = baddr + NREGIONS_SIZE;
1657         else
1658                 eaddr = baddr + slab_size;
1659
1660         /*
1661          * This usually just works but might not.
1662          */
1663         addr = mmap(baddr, slab_size, PROT_READ|PROT_WRITE,
1664                     MAP_PRIVATE | MAP_ANON | MAP_SIZEALIGN, -1, 0);
1665         if (addr == MAP_FAILED) {
1666                 return (NULL);
1667         }
1668         if (addr < baddr || addr + slab_size > eaddr) {
1669                 munmap(addr, slab_size);
1670                 return (NULL);
1671         }
1672
1673         /*
1674          * Check alignment.  The misaligned offset is also the excess
1675          * amount.  If misaligned unmap the excess so we have a chance of
1676          * mapping at the next alignment point and recursively try again.
1677          *
1678          * BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB  block alignment
1679          *   aaaaaaaaa aaaaaaaaaaa aa           mis-aligned allocation
1680          *   xxxxxxxxx                          final excess calculation
1681          *   ^ returned address
1682          */
1683         excess = (uintptr_t)addr & (slab_size - 1);
1684         while (excess) {
1685                 excess = slab_size - excess;
1686                 save = addr;
1687
1688                 munmap(save + excess, slab_size - excess);
1689                 addr = _vmem_alloc(ri, slab_size);
1690                 munmap(save, excess);
1691                 if (addr == NULL)
1692                         return (NULL);
1693                 if (addr < baddr || addr + slab_size > eaddr) {
1694                         munmap(addr, slab_size);
1695                         return (NULL);
1696                 }
1697                 excess = (uintptr_t)addr & (slab_size - 1);
1698         }
1699         return (addr);
1700 }
1701
1702 /*
1703  * _vmem_free()
1704  *
1705  *      Free a chunk of memory allocated with _vmem_alloc()
1706  */
1707 static void
1708 _vmem_free(void *ptr, size_t size)
1709 {
1710         munmap(ptr, size);
1711 }
1712
1713 /*
1714  * Panic on fatal conditions
1715  */
1716 static void
1717 _mpanic(const char *ctl, ...)
1718 {
1719         va_list va;
1720
1721         if (malloc_panic == 0) {
1722                 malloc_panic = 1;
1723                 va_start(va, ctl);
1724                 vfprintf(stderr, ctl, va);
1725                 fprintf(stderr, "\n");
1726                 fflush(stderr);
1727                 va_end(va);
1728         }
1729         abort();
1730 }
1731
1732 __weak_reference(__aligned_alloc, aligned_alloc);
1733 __weak_reference(__malloc, malloc);
1734 __weak_reference(__calloc, calloc);
1735 __weak_reference(__posix_memalign, posix_memalign);
1736 __weak_reference(__realloc, realloc);
1737 __weak_reference(__free, free);