 * DMALLOC.C	- Dillon's malloc
 *
 * Copyright (c) 2011,2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 *
 * This module implements a modified slab allocator as a drop-in replacement
 * for the libc malloc().  The slab algorithm has been adjusted to support
 * dynamic sizing of slabs which effectively allows slabs to be used for
 * allocations of any size.  Because of this we have neither a small-block
 * allocator nor a big-block allocator and the code paths are simplified.
 *
 * To support dynamic slab sizing available user virtual memory is broken
 * down into ~1024 regions.  Each region has a fixed slab size whose value is
 * set when the region is opened up for use.  The free() path simply applies
 * a mask based on the region to the pointer to acquire the base of the
 * governing slab structure.
 *
 *	Regions[NREGIONS]	(1024)
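 *
 * For example (a sketch of the lookup; a slab whose header is out of band
 * is instead found via the region->slab pointer, as in memfree() below):
 *
 *	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
 *	slab = (slab_t)((uintptr_t)ptr & region->mask);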
 *
 * Slab management and locking is done on a per-zone basis.
 *
 *	Alloc Size	Chunking	Number of zones
 *	...		continues forever	4 zones
 *
 * For a 2^63 memory space each doubling >= 64K is broken down into
 * 4 chunking zones, so we support 88 + (48 * 4) = 280 zones.
 *
 * API FEATURES AND SIDE EFFECTS
 *
 *	+ power-of-2 sized allocations up to a page will be power-of-2
 *	  aligned.  Above that, power-of-2 sized allocations are page-aligned.
 *	  Non power-of-2 sized allocations are aligned the same as the chunk
 *	  size for their zone.
 *	+ ability to allocate arbitrarily large chunks of memory
 *	+ realloc will reuse the passed pointer if possible, within the
 *	  limitations of the zone chunking.
 *
 * On top of the slab allocator we also implement a 16-entry-per-thread
 * magazine cache for allocations <= NOMSLABSIZE.
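 *
 * A magazine pop is the allocation fast path; a sketch of the form used
 * in memalloc() below (per-thread state, so no locking is required):
 *
 *	if ((j = zinfo->mag_index) != 0) {
 *		zinfo->mag_index = --j;
 *		obj = zinfo->mag_shortcut[j];
 *	}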
 *
 *	+ [better] garbage collection
 *	+ better initial sizing.
 *
 * The value of the environment variable MALLOC_OPTIONS is a character string
 * containing various flags to tune nmalloc.  Upper case letters enable
 * or increase a feature, lower case letters disable or decrease it.
 *
 * U	Enable UTRACE for all operations, observable with ktrace.
 *	Disabled by default.
 *
 * Z	Zero out allocations, otherwise allocations (except for
 *	calloc) will contain garbage.
 *	Disabled by default.
 *
 * H	Pass a hint with madvise() about unused pages.
 *	Disabled by default.
 *	Not currently implemented.
 *
 * F	Disable local per-thread caching.
 *	Disabled by default.
 *
 * C	Increase (decrease) how much excess cache to retain.
 *	Set to 4 by default.
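 *
 * For example, MALLOC_OPTIONS=ZC zeroes all allocations and raises the
 * cache level to 5.  A sketch of the corresponding cases in the option
 * loop in malloc_init() (the full switch is not reproduced here):
 *
 *	case 'c': --opt_cache; break;
 *	case 'C': ++opt_cache; break;
 *	case 'z': g_malloc_flags = 0; break;
 *	case 'Z': g_malloc_flags = SAFLAG_ZERO; break;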
/* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o dmalloc.so dmalloc.c */

#ifndef STANDALONE_DEBUG
#include "libc_private.h"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/queue.h>
#include <sys/ktrace.h>
#include <machine/atomic.h>
#include <machine/cpufunc.h>

#ifdef STANDALONE_DEBUG
void _nmalloc_thr_init(void);

#include "spinlock.h"
#include "un-namespace.h"

#ifndef MAP_SIZEALIGN
#define MAP_SIZEALIGN	0
#if SSIZE_MAX == 0x7FFFFFFF
#define UVM_BITS	32	/* worst case */
#else
#define UVM_BITS	48	/* worst case XXX */
#endif

#if LONG_MAX == 0x7FFFFFFF
#define LONG_BITS_SHIFT	5
#else
#define LONG_BITS_SHIFT	6
#endif

#define LOCKEDPTR	((void *)(intptr_t)-1)

#define NREGIONS_BITS	10
#define NREGIONS	(1 << NREGIONS_BITS)
#define NREGIONS_MASK	(NREGIONS - 1)
#define NREGIONS_SHIFT	(UVM_BITS - NREGIONS_BITS)
#define NREGIONS_SIZE	(1LU << NREGIONS_SHIFT)
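
/*
 * Example: with UVM_BITS 48 and NREGIONS_BITS 10, NREGIONS_SHIFT is 38,
 * so each of the 1024 regions spans NREGIONS_SIZE = 2^38 bytes (256GB)
 * and a pointer's region index is simply (ptr >> 38) & 1023.
 */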
typedef struct region *region_t;
typedef struct slglobaldata *slglobaldata_t;
typedef struct slab *slab_t;

	slab_t	slab;		/* conditional out of band slab */

static struct region Regions[NREGIONS];

/*
 * Number of chunking zones available
 */
#define CHUNKFACTOR	8
#define NZONES		(16 + 9 * CHUNKFACTOR + 16 * CHUNKFACTOR)
#define NZONES		(16 + 9 * CHUNKFACTOR + 48 * CHUNKFACTOR)

static int MaxChunks[NZONES];

#define NDEPOTS		8		/* must be power of 2 */

/*
 * Maximum number of chunks per slab, governed by the allocation bitmap in
 * each slab.  The maximum is reduced for large chunk sizes.
 */
#define MAXCHUNKS	(LONG_BITS * LONG_BITS)
#define MAXCHUNKS_BITS	(LONG_BITS_SHIFT + LONG_BITS_SHIFT)
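/*
 * On a 64-bit system (LONG_BITS 64, LONG_BITS_SHIFT 6) a slab bitmap can
 * describe at most 64 * 64 = 4096 chunks, so MAXCHUNKS_BITS, the log2 of
 * MAXCHUNKS, is 6 + 6 = 12.  slaballoc() relies on this when it seeds
 * power = 3 + MAXCHUNKS_BITS for the initial slab_size of 8 * MAXCHUNKS.
 */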
#define LITSLABSIZE	(32 * 1024)
#define NOMSLABSIZE	(2 * 1024 * 1024)
#define BIGSLABSIZE	(128 * 1024 * 1024)

#define ZALLOC_SLAB_MAGIC	0x736c6162	/* magic sanity */

TAILQ_HEAD(slab_list, slab);
	struct slab	*next;		/* slabs with available space */
	TAILQ_ENTRY(slab) entry;
	int32_t		magic;		/* magic number for sanity check */
	u_int		navail;		/* number of free elements available */
	u_int		free_bit;	/* free hint bitno */
	u_int		free_index;	/* free hint index */
	u_long		bitmap[LONG_BITS]; /* free chunks */
	size_t		slab_size;	/* size of entire slab */
	size_t		chunk_size;	/* chunk size for validation */
	enum { UNKNOWN, AVAIL, EMPTY, FULL } state;
	region_t	region;		/* related region */
	char		*chunks;	/* chunk base */
	slglobaldata_t	slgd;		/* localized to thread else NULL */
/*
 * per-thread data + global depot
 *
 * NOTE: The magazine shortcut is only used for per-thread data.
 */
#define NMAGSHORTCUT	16

struct slglobaldata {
	spinlock_t	lock;		/* only used by slglobaldepot */
	void		*mag_shortcut[NMAGSHORTCUT];
	struct slab_list full_zones;	/* via entry */

#define SLAB_ZEROD	0x0001
/*
 * Misc constants.  Note that allocations that are exact multiples of
 * PAGE_SIZE, or exceed the zone limit, fall through to the kmem module.
 * IN_SAME_PAGE_MASK is used to sanity-check the per-page free lists.
 */
#define MIN_CHUNK_SIZE	8		/* in bytes */
#define MIN_CHUNK_MASK	(MIN_CHUNK_SIZE - 1)

#define SAFLAG_ZERO	0x00000001

/*
 * The WEIRD_ADDR is used as known text to copy into free objects to
 * try to create deterministic failure cases if the data is accessed after
 *
 * WARNING: A limited number of spinlocks are available, BIGXSIZE should
 *	    not be larger than 64.
 */
#define WEIRD_ADDR	0xdeadc0de
#define MASSERT(exp)	do { if (__predict_false(!(exp)))	\
				_mpanic("assertion: %s in %s",	\
					#exp, __func__);	\
			} while(0)

/*
 * With this attribute set, do not require a function call for accessing
 * this variable when the code is compiled -fPIC.
 *
 * Must be empty for libc_rtld (similar to __thread)
 */
#if defined(__LIBC_RTLD)
#define TLS_ATTRIBUTE
#else
#define TLS_ATTRIBUTE	__attribute__((tls_model("initial-exec")))
#endif
static __thread struct slglobaldata slglobal TLS_ATTRIBUTE;
static pthread_key_t thread_malloc_key;
static pthread_once_t thread_malloc_once = PTHREAD_ONCE_INIT;
static struct slglobaldata slglobaldepot;

static int opt_madvise = 0;
static int opt_free = 0;
static int opt_cache = 4;
static int opt_utrace = 0;
static int g_malloc_flags = 0;
static int malloc_panic;
static const int32_t weirdary[16] = {
	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR
};
static void *memalloc(size_t size, int flags);
static void *memrealloc(void *ptr, size_t size);
static void memfree(void *ptr, int);
static int memalign(void **memptr, size_t alignment, size_t size);
static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size);
static void slabfree(slab_t slab);
static void slabterm(slglobaldata_t slgd, slab_t slab);
static void *_vmem_alloc(int ri, size_t slab_size);
static void _vmem_free(void *ptr, size_t slab_size);
static void _mpanic(const char *ctl, ...) __printflike(1, 2);
#ifndef STANDALONE_DEBUG
static void malloc_init(void) __constructor(101);
#else
static void malloc_init(void) __constructor(101);
#endif
struct nmalloc_utrace {

#define UTRACE(a, b, c)						\
	struct nmalloc_utrace ut = {				\
	utrace(&ut, sizeof(ut));				\

/*
 * If enabled, any memory allocated without M_ZERO is initialized to -1.
 */
static int use_malloc_pattern;
	const char *p = NULL;

	TAILQ_INIT(&slglobal.full_zones);

	Regions[0].mask = -1;	/* disallow activity in lowest region */

	if (issetugid() == 0)
		p = getenv("MALLOC_OPTIONS");

	for (; p != NULL && *p != '\0'; p++) {
			g_malloc_flags = SAFLAG_ZERO;

	UTRACE((void *) -1, 0, NULL);
/*
 * We have to install a handler for nmalloc thread teardowns when
 * the thread is created.  We cannot delay this because destructors in
 * sophisticated userland programs can call malloc() for the first time
 * during their thread exit.
 *
 * This routine is called directly from pthreads.
 */
static void _nmalloc_thr_init_once(void);
static void _nmalloc_thr_destructor(void *thrp);

_nmalloc_thr_init(void)
	TAILQ_INIT(&slglobal.full_zones);
	pthread_once(&thread_malloc_once, _nmalloc_thr_init_once);
	pthread_setspecific(thread_malloc_key, &slglobal);
_nmalloc_thr_prepfork(void)
	_SPINLOCK(&slglobaldepot.lock);

_nmalloc_thr_parentfork(void)
	_SPINUNLOCK(&slglobaldepot.lock);

_nmalloc_thr_childfork(void)
	_SPINUNLOCK(&slglobaldepot.lock);

_nmalloc_thr_init_once(void)
	error = pthread_key_create(&thread_malloc_key, _nmalloc_thr_destructor);
/*
 * Called for each thread undergoing exit
 *
 * Move all of the thread's slabs into a depot.
 */
_nmalloc_thr_destructor(void *thrp)
	slglobaldata_t slgd = thrp;
	struct zoneinfo *zinfo;
	for (i = 0; i <= slgd->biggest_index; i++) {
		zinfo = &slgd->zone[i];

		while ((j = zinfo->mag_index) > 0) {
			ptr = zinfo->mag_shortcut[--j];
			zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
			zinfo->mag_index = j;
		while ((slab = zinfo->empty_base) != NULL) {
			zinfo->empty_base = slab->next;
			--zinfo->empty_count;
			slabterm(slgd, slab);
		}

		while ((slab = zinfo->avail_base) != NULL) {
			zinfo->avail_base = slab->next;
			--zinfo->avail_count;
			slabterm(slgd, slab);
		}
	}

	while ((slab = TAILQ_FIRST(&slgd->full_zones)) != NULL) {
		TAILQ_REMOVE(&slgd->full_zones, slab, entry);
		slabterm(slgd, slab);
	}
/*
 * Calculate the zone index for the allocation request size and set the
 * allocation request size to that particular zone's chunk size.
 *
 * Minimum alignment is 16 bytes for allocations >= 16 bytes to conform
 * with malloc requirements for intel/amd.
 */
zoneindex(size_t *bytes, size_t *chunking)
	size_t n = (size_t)*bytes;

		*bytes = n = (n + 7) & ~7;
		return(n / 8 - 1);	/* 8 byte chunks, 2 zones */

		*bytes = n = (n + 15) & ~15;
		return(n / 16 + 2);	/* 16 byte chunks, 8 zones */
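		/*
		 * Worked example (assuming this branch covers requests up
		 * to 128 bytes): a 100-byte request rounds up to
		 * (100 + 15) & ~15 = 112 and returns zone 112 / 16 + 2 = 9.
		 */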
	c = x / (CHUNKFACTOR * 2);
	c = x / (CHUNKFACTOR * 2);
	i = 16 + CHUNKFACTOR * 5;	/* 256->512,1024,2048,4096,8192 */
		_mpanic("slaballoc: byte value too high");
	*bytes = n = roundup2(n, c);
	return (i + n / c - CHUNKFACTOR);

	*bytes = n = (n + c - 1) & ~(c - 1);

	*bytes = n = (n + 15) & ~15;
	return(n / (CHUNKINGLO*2) + CHUNKINGLO*1 - 1);

	*bytes = n = (n + 31) & ~31;
	return(n / (CHUNKINGLO*4) + CHUNKINGLO*2 - 1);

	*bytes = n = (n + 63) & ~63;
	return(n / (CHUNKINGLO*8) + CHUNKINGLO*3 - 1);

	*bytes = n = (n + 127) & ~127;
	return(n / (CHUNKINGLO*16) + CHUNKINGLO*4 - 1);

	*bytes = n = (n + 255) & ~255;
	return(n / (CHUNKINGLO*32) + CHUNKINGLO*5 - 1);

	*bytes = n = (n + 511) & ~511;
	return(n / (CHUNKINGLO*64) + CHUNKINGLO*6 - 1);

	*bytes = n = (n + 1023) & ~1023;
	return(n / (CHUNKINGLO*128) + CHUNKINGLO*7 - 1);

	if (n < 32768) {		/* 16384-32767 */
		*bytes = n = (n + 2047) & ~2047;
		return(n / (CHUNKINGLO*256) + CHUNKINGLO*8 - 1);
	}

	*bytes = n = (n + 4095) & ~4095;	/* 32768-65535 */
	return(n / (CHUNKINGLO*512) + CHUNKINGLO*9 - 1);

	i = CHUNKINGLO*10 - 1;
		_mpanic("slaballoc: byte value too high");
	*bytes = n = (n + c - 1) & ~(c - 1);
/*
 * malloc() - call internal slab allocator
 */
__malloc(size_t size)
	ptr = memalloc(size, 0);
	UTRACE(0, size, ptr);

/*
 * calloc() - call internal slab allocator
 */
__calloc(size_t number, size_t size)
	ptr = memalloc(number * size, SAFLAG_ZERO);
	UTRACE(0, number * size, ptr);
/*
 * realloc() (SLAB ALLOCATOR)
 *
 * We do not attempt to optimize this routine beyond reusing the same
 * pointer if the new size fits within the chunking of the old pointer's
 */
__realloc(void *ptr, size_t size)
		ret = memalloc(size, 0);
		ret = memrealloc(ptr, size);
	UTRACE(ptr, size, ret);
/*
 * Allocate (size) bytes with an alignment of (alignment).
 */
__aligned_alloc(size_t alignment, size_t size)
	rc = memalign(&ptr, alignment, size);

/*
 * Allocate (size) bytes with an alignment of (alignment), where (alignment)
 * is a power of 2 >= sizeof(void *).
 */
__posix_memalign(void **memptr, size_t alignment, size_t size)
	/*
	 * OpenGroup spec issue 6 check
	 */
	if (alignment < sizeof(void *)) {

	rc = memalign(memptr, alignment, size);
/*
 * The slab allocator will allocate on power-of-2 boundaries up to at least
 * PAGE_SIZE.  Otherwise we use the zoneindex mechanic to find a zone
 * matching the requirements.
 */
memalign(void **memptr, size_t alignment, size_t size)
	/*
	 * OpenGroup spec issue 6 check
	 */
	if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) {
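		/*
		 * The test above accepts only powers of 2: for alignment 16,
		 * (16 | 15) + 1 == 32 == 16 << 1, but for alignment 24,
		 * (24 | 23) + 1 == 32 != 48, so the request is rejected.
		 */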
	/*
	 * XXX for now just find the nearest power of 2 >= size and also
	 * >= alignment and allocate that.
	 */
	while (alignment < size) {
		alignment <<= 1;
		if (alignment == 0)
			_mpanic("posix_memalign: byte value too high");
	}
	*memptr = memalloc(alignment, 0);
	return(*memptr ? 0 : ENOMEM);
/*
 * free() (SLAB ALLOCATOR) - do the obvious
 */

/*
 * memalloc()	(SLAB ALLOCATOR)
 *
 * Allocate memory via the slab allocator.
 */
memalloc(size_t size, int flags)
	struct zoneinfo *zinfo;

	/*
	 * If 0 bytes are requested we have to return a unique pointer, allocate
	 */

	/* Capture global flags */
	flags |= g_malloc_flags;

	/* Compute allocation zone; zoneindex will panic on excessive sizes */
	zi = zoneindex(&size, &chunking);
	MASSERT(zi < NZONES);
	/*
	 * Try magazine shortcut first
	 */
	zinfo = &slgd->zone[zi];

	if ((j = zinfo->mag_index) != 0) {
		zinfo->mag_index = --j;
		obj = zinfo->mag_shortcut[j];
		zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
		if (flags & SAFLAG_ZERO)
	/*
	 * Locate a slab with available space.  If no slabs are available
	 * back-off to the empty list and if we still come up dry allocate
	 * a new slab (which will try the depot first).
	 */
	if ((slab = zinfo->avail_base) == NULL) {
		if ((slab = zinfo->empty_base) == NULL) {
			slab = slaballoc(zi, chunking, size);
			slab->next = zinfo->avail_base;
			zinfo->avail_base = slab;
			++zinfo->avail_count;
			if (slgd->biggest_index < zi)
				slgd->biggest_index = zi;
		} else {
			/*
			 * Pulled from empty list
			 */
			zinfo->empty_base = slab->next;
			slab->next = zinfo->avail_base;
			zinfo->avail_base = slab;
			++zinfo->avail_count;
			--zinfo->empty_count;
		}
	}
	/*
	 * Allocate a chunk out of the slab.  HOT PATH
	 *
	 * Only the thread owning the slab can allocate out of it.
	 *
	 * NOTE: The last bit in the bitmap is always marked allocated so
	 *	 we cannot overflow here.
	 */
	bno = slab->free_bit;
	bmi = slab->free_index;
	bmp = &slab->bitmap[bmi];
	if (*bmp & (1LU << bno)) {
		atomic_clear_long(bmp, 1LU << bno);
		obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size;
		slab->free_bit = (bno + 1) & (LONG_BITS - 1);
		atomic_add_int(&slab->navail, -1);
		if (flags & SAFLAG_ZERO)
	/*
	 * Allocate a chunk out of a slab.  COLD PATH
	 */
	if (slab->navail == 0) {
		zinfo->avail_base = slab->next;
		--zinfo->avail_count;
		TAILQ_INSERT_TAIL(&slgd->full_zones, slab, entry);
	}

	while (bmi < LONG_BITS) {
		bmp = &slab->bitmap[bmi];
			atomic_clear_long(bmp, 1LU << bno);
			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
			      size;
			slab->free_index = bmi;
			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
			atomic_add_int(&slab->navail, -1);
			if (flags & SAFLAG_ZERO)

	while (bmi < LONG_BITS) {
		bmp = &slab->bitmap[bmi];
			atomic_clear_long(bmp, 1LU << bno);
			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
			      size;
			slab->free_index = bmi;
			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
			atomic_add_int(&slab->navail, -1);
			if (flags & SAFLAG_ZERO)

	_mpanic("slaballoc: corrupted zone: navail %d", slab->navail);
/*
 * Reallocate memory within the chunk
 */
memrealloc(void *ptr, size_t nsize)
	/*
	 * If 0 bytes are requested we have to return a unique pointer, allocate
	 */

	/* Capture global flags */
	flags |= g_malloc_flags;

	/*
	 * Locate the zone by looking up the dynamic slab size mask based
	 * on the memory region the allocation resides in.
	 */
	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
	if ((slab = region->slab) == NULL)
		slab = (void *)((uintptr_t)ptr & region->mask);
	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
	osize = slab->chunk_size;
	if (nsize <= osize) {
		if (osize < 32 || nsize >= osize / 2) {
			if ((flags & SAFLAG_ZERO) && nsize < osize)
				bzero(obj + nsize, osize - nsize);

	/*
	 * Otherwise resize the object
	 */
	obj = memalloc(nsize, 0);
		bcopy(ptr, obj, nsize);
/*
 * free (SLAB ALLOCATOR)
 *
 * Free a memory block previously allocated by malloc.
 */
memfree(void *ptr, int flags)
	slglobaldata_t slgd;

	/*
	 * Locate the zone by looking up the dynamic slab size mask based
	 * on the memory region the allocation resides in.
	 *
	 * WARNING! The slab may be owned by another thread!
	 */
	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
	if ((slab = region->slab) == NULL)
		slab = (void *)((uintptr_t)ptr & region->mask);
	MASSERT(slab != NULL);
	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);

	/*
	 * Put weird data into the memory to detect modifications after
	 * freeing, illegal pointer use after freeing (we should fault on
	 * the odd address), and so forth.
	 */
	if (slab->chunk_size < sizeof(weirdary))
		bcopy(weirdary, ptr, slab->chunk_size);
	else
		bcopy(weirdary, ptr, sizeof(weirdary));
	/*
	 * Use mag_shortcut[] when possible
	 */
	if (slgd->masked == 0 && slab->chunk_size <= NOMSLABSIZE) {
		struct zoneinfo *zinfo;

		zinfo = &slgd->zone[slab->zone_index];
		j = zinfo->mag_index;
		if (j < NMAGSHORTCUT) {
			zinfo->mag_shortcut[j] = ptr;
			zinfo->mag_index = j + 1;
	/*
	 * Free to slab and increment navail.  We can delay incrementing
	 * navail to prevent the slab from being destroyed out from under
	 * us while we do other optimizations.
	 */
	bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size;
	bmi = bno >> LONG_BITS_SHIFT;
	bno &= (LONG_BITS - 1);
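	/*
	 * e.g. with 64-bit longs, freeing chunk #70 gives bmi = 70 >> 6 = 1
	 * and bno = 70 & 63 = 6, i.e. bit 6 of bitmap word 1.
	 */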
	bmp = &slab->bitmap[bmi];

	MASSERT(bmi >= 0 && bmi < slab->nmax);
	MASSERT((*bmp & (1LU << bno)) == 0);
	atomic_set_long(bmp, 1LU << bno);
	if (slab->slgd == slgd) {
		/*
		 * We can only do the following if we own the slab.  Note
		 * that navail can be incremented by any thread even if
		 */
		struct zoneinfo *zinfo;

		atomic_add_int(&slab->navail, 1);
		if (slab->free_index > bmi) {
			slab->free_index = bmi;
			slab->free_bit = bno;
		} else if (slab->free_index == bmi &&
			   slab->free_bit > bno) {
			slab->free_bit = bno;
		}
		zinfo = &slgd->zone[slab->zone_index];

		/*
		 * Freeing an object from a full slab makes it less than
		 * full.  The slab must be moved to the available list.
		 *
		 * If the available list has too many slabs, release some
		 */
		if (slab->state == FULL) {
			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
			slab->state = AVAIL;
			stmp = zinfo->avail_base;
			zinfo->avail_base = slab;
			++zinfo->avail_count;
			while (zinfo->avail_count > opt_cache) {
				slab = zinfo->avail_base;
				zinfo->avail_base = slab->next;
				--zinfo->avail_count;
				slabterm(slgd, slab);
			}
		}
		/*
		 * If the slab becomes completely empty dispose of it in
		 * some manner.  By default each thread caches up to 4
		 * empty slabs.  Only small slabs are cached.
		 */
		if (slab->navail == slab->nmax && slab->state == AVAIL) {
			/*
			 * Remove slab from available queue
			 */
			slabp = &zinfo->avail_base;
			while ((stmp = *slabp) != slab)
				slabp = &stmp->next;
			*slabp = slab->next;
			--zinfo->avail_count;

			if (opt_free || opt_cache == 0) {
				/*
				 * If local caching is disabled cache the
				 * slab in the depot (or free it).
				 */
				slabterm(slgd, slab);
			} else if (slab->slab_size > BIGSLABSIZE) {
				/*
				 * We do not try to retain large slabs
				 * in per-thread caches.
				 */
				slabterm(slgd, slab);
			} else if (zinfo->empty_count > opt_cache) {
				/*
				 * We have too many slabs cached, but
				 * instead of freeing this one free
				 * an empty slab that's been idle longer.
				 *
				 * (empty_count does not change)
				 */
				stmp = zinfo->empty_base;
				slab->state = EMPTY;
				slab->next = stmp->next;
				zinfo->empty_base = slab;
				slabterm(slgd, stmp);
			} else {
				/*
				 * Cache the empty slab in our thread local
				 */
				++zinfo->empty_count;
				slab->state = EMPTY;
				slab->next = zinfo->empty_base;
				zinfo->empty_base = slab;
			}
		}
	} else if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
		slglobaldata_t sldepot;

		/*
		 * If freeing to a slab owned by the global depot, and
		 * the slab becomes completely EMPTY, try to move it to
		 */
		sldepot = &slglobaldepot;
		_SPINLOCK(&sldepot->lock);
		if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
			struct zoneinfo *zinfo;

			/*
			 * Move the slab to the empty list
			 */
			MASSERT(slab->state == AVAIL);
			atomic_add_int(&slab->navail, 1);
			zinfo = &sldepot->zone[slab->zone_index];
			slabp = &zinfo->avail_base;
			while (slab != *slabp)
				slabp = &(*slabp)->next;
			*slabp = slab->next;
			--zinfo->avail_count;

			/*
			 * Clean out excessive empty entries from the
			 */
			slab->state = EMPTY;
			slab->next = zinfo->empty_base;
			zinfo->empty_base = slab;
			++zinfo->empty_count;
			while (zinfo->empty_count > opt_cache) {
				slab = zinfo->empty_base;
				zinfo->empty_base = slab->next;
				--zinfo->empty_count;
				slab->state = UNKNOWN;
			}
			_SPINUNLOCK(&sldepot->lock);

		_SPINLOCK(&sldepot->lock);
		atomic_add_int(&slab->navail, 1);
		_SPINUNLOCK(&sldepot->lock);

	/*
	 * We can't act on the slab other than by adjusting navail
	 * (and the bitmap which we did in the common code at the
	 */
	atomic_add_int(&slab->navail, 1);
/*
 * Allocate a new slab holding objects of size chunk_size.
 */
slaballoc(int zi, size_t chunking, size_t chunk_size)
	slglobaldata_t slgd;
	slglobaldata_t sldepot;
	struct zoneinfo *zinfo;
	uintptr_t chunk_offset;

	/*
	 * First look in the depot.  Any given zone in the depot may be
	 * locked by being set to -1.  We have to do this instead of simply
	 * removing the entire chain because removing the entire chain can
	 * cause racing threads to allocate local slabs for large objects,
	 * resulting in a large VSZ.
	 */
	sldepot = &slglobaldepot;
	zinfo = &sldepot->zone[zi];

	if (zinfo->avail_base) {
		_SPINLOCK(&sldepot->lock);
		slab = zinfo->avail_base;
		MASSERT(slab->slgd == NULL);
		zinfo->avail_base = slab->next;
		--zinfo->avail_count;
		_SPINUNLOCK(&sldepot->lock);
		_SPINUNLOCK(&sldepot->lock);
	/*
	 * Nothing in the depot, allocate a new slab by locating or assigning
	 * a region and then using the system virtual memory allocator.
	 */

	/*
	 * Calculate the start of the data chunks relative to the start
	 * of the slab.  If chunk_size is a power of 2 we guarantee
	 * power of 2 alignment.  If it is not we guarantee alignment
	 * to the chunk size.
	 */
	if ((chunk_size ^ (chunk_size - 1)) == (chunk_size << 1) - 1) {
		chunk_offset = roundup2(sizeof(*slab), chunk_size);
	} else {
		chunk_offset = sizeof(*slab) + chunking - 1;
		chunk_offset -= chunk_offset % chunking;
	}
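	/*
	 * e.g. chunk_size 256 takes the first branch since
	 * 256 ^ 255 == 511 == (256 << 1) - 1, padding the in-band header
	 * out to a 256-byte boundary; a non-power-of-2 chunk_size instead
	 * rounds the header up to a multiple of the chunking.
	 */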
	/*
	 * Calculate a reasonable number of chunks for the slab.
	 *
	 * Once initialized the MaxChunks[] array can only ever be
	 * reinitialized to the same value.
	 */
	maxchunks = MaxChunks[zi];
	if (maxchunks == 0) {
		/*
		 * First calculate how many chunks would fit in 1/1024
		 * available memory.  This is around 2MB on a 32 bit
		 * system and 128G on a 64-bit (48-bits available) system.
		 */
		maxchunks = (ssize_t)(NREGIONS_SIZE - chunk_offset) /
			    (ssize_t)chunk_size;

		/*
		 * A slab cannot handle more than MAXCHUNKS chunks, but
		 * limit us to approximately MAXCHUNKS / 2 here because
		 * we may have to expand maxchunks when we calculate the
		 * actual power-of-2 slab.
		 */
		if (maxchunks > MAXCHUNKS / 2)
			maxchunks = MAXCHUNKS / 2;

		/*
		 * Try to limit the slabs to BIGSLABSIZE (~128MB).  Larger
		 * slabs will be created if the allocation does not fit.
		 */
		if (chunk_offset + chunk_size * maxchunks > BIGSLABSIZE) {
			tmpchunks = (ssize_t)(BIGSLABSIZE - chunk_offset) /
				    (ssize_t)chunk_size;
			if (maxchunks > tmpchunks)
				maxchunks = tmpchunks;
		}

		/*
		 * If the slab calculates to greater than 2MB see if we
		 * can cut it down to ~2MB.  This controls VSZ but has
		 * no effect on run-time size or performance.
		 *
		 * This is very important in case you core dump and also
		 * important to reduce unnecessary region allocations.
		 */
		if (chunk_offset + chunk_size * maxchunks > NOMSLABSIZE) {
			tmpchunks = (ssize_t)(NOMSLABSIZE - chunk_offset) /
				    (ssize_t)chunk_size;
			if (maxchunks > tmpchunks)
				maxchunks = tmpchunks;
		}

		/*
		 * If the slab calculates to greater than 128K see if we
		 * can cut it down to ~128K while still maintaining a
		 * reasonably large number of chunks in each slab.  This
		 * controls VSZ but has no effect on run-time size or
		 * performance.
		 *
		 * This is very important in case you core dump and also
		 * important to reduce unnecessary region allocations.
		 */
		if (chunk_offset + chunk_size * maxchunks > LITSLABSIZE) {
			tmpchunks = (ssize_t)(LITSLABSIZE - chunk_offset) /
				    (ssize_t)chunk_size;
			if (maxchunks > tmpchunks)
				maxchunks = tmpchunks;
		}

		MaxChunks[zi] = maxchunks;
	}
	MASSERT(maxchunks > 0 && maxchunks <= MAXCHUNKS);
	/*
	 * Calculate the actual slab size.  maxchunks will be recalculated
	 */
	slab_desire = chunk_offset + chunk_size * maxchunks;
	slab_size = 8 * MAXCHUNKS;
	power = 3 + MAXCHUNKS_BITS;
	while (slab_size < slab_desire) {
		slab_size <<= 1;
		++power;
	}
	/*
	 * Do a quick recalculation based on the actual slab size but not
	 * yet dealing with whether the slab header is in-band or out-of-band.
	 * The purpose here is to see if we can reasonably reduce slab_size
	 * to a power of 4 to allow more chunk sizes to use the same slab
	 */
	if ((power & 1) && slab_size > 32768) {
		maxchunks = (slab_size - chunk_offset) / chunk_size;
		if (maxchunks >= MAXCHUNKS / 8) {
			slab_size >>= 1;
			--power;
		}
	}
	if ((power & 2) && slab_size > 32768 * 4) {
		maxchunks = (slab_size - chunk_offset) / chunk_size;
		if (maxchunks >= MAXCHUNKS / 4) {
			slab_size >>= 2;
			power -= 2;
		}
	}
	/*
	 * This case occurs when the slab_size is larger than 1/1024 available
	 */
	nswath = slab_size / NREGIONS_SIZE;
	if (nswath > NREGIONS)

	/*
	 * Try to allocate from our current best region for this zi
	 */
	region_mask = ~(slab_size - 1);
	ri = slgd->zone[zi].best_region;
	if (Regions[ri].mask == region_mask) {
		if ((slab = _vmem_alloc(ri, slab_size)) != NULL)

	/*
	 * Try to find an existing region to allocate from.  The normal
	 * case will be for allocations that are less than 1/1024 available
	 * UVM, which fit into a single Regions[] entry.
	 */
	while (slab_size <= NREGIONS_SIZE) {
		for (ri = 0; ri < NREGIONS; ++ri) {
			if (rx < 0 && Regions[ri].mask == 0)
				rx = ri;
			if (Regions[ri].mask == region_mask) {
				slab = _vmem_alloc(ri, slab_size);
				if (slab)
					slgd->zone[zi].best_region = ri;
			}
		}

		/*
		 * This can fail, retry either way
		 */
		atomic_cmpset_ptr((void **)&Regions[rx].mask,
				  NULL,
				  (void *)region_mask);
	for (ri = 0; ri < NREGIONS; ri += nswath) {
		if (Regions[ri].mask == region_mask) {
			slab = _vmem_alloc(ri, slab_size);
			if (slab)
				slgd->zone[zi].best_region = ri;
		}

		for (j = nswath - 1; j >= 0; --j) {
			if (Regions[ri+j].mask != 0)
				break;
		}

		/*
		 * We found a candidate, try to allocate it backwards so
		 * another thread racing a slaballoc() does not see the
		 * mask in the base index position until we are done.
		 *
		 * We can safely zero-out any partial allocations because
		 * the mask is only accessed from the base index.  Any other
		 * threads racing us will fail prior to us clearing the mask.
		 */
		for (j = nswath - 1; j >= 0; --j) {
			if (!atomic_cmpset_ptr((void **)&Regions[rx+j].mask,
					       NULL, (void *)region_mask)) {
				while (++j < nswath)
					Regions[rx+j].mask = 0;
	/*
	 * Fill in the new slab in region ri.  If the slab_size completely
	 * fills one or more region slots we move the slab structure out of
	 * band which should optimize the chunking (particularly for a power
	 */
	region = &Regions[ri];
	MASSERT(region->slab == NULL);
	if (slab_size >= NREGIONS_SIZE) {
		slab = memalloc(sizeof(*slab), 0);
		bzero(slab, sizeof(*slab));
		slab->chunks = save;
		for (j = 0; j < nswath; ++j)
			region[j].slab = slab;
	} else {
		bzero(slab, sizeof(*slab));
		slab->chunks = (char *)slab + chunk_offset;
	}
	/*
	 * Calculate the start of the chunks memory and recalculate the
	 * actual number of chunks the slab can hold.
	 */
	maxchunks = (slab_size - chunk_offset) / chunk_size;
	if (maxchunks > MAXCHUNKS)
		maxchunks = MAXCHUNKS;

	/*
	 * And fill in the rest
	 */
	slab->magic = ZALLOC_SLAB_MAGIC;
	slab->navail = maxchunks;
	slab->nmax = maxchunks;
	slab->slab_size = slab_size;
	slab->chunk_size = chunk_size;
	slab->zone_index = zi;
	slab->state = UNKNOWN;
	slab->region = region;

	for (ri = 0; ri < maxchunks; ri += LONG_BITS) {
		if (ri + LONG_BITS <= maxchunks)
			slab->bitmap[ri >> LONG_BITS_SHIFT] = ULONG_MAX;
		else
			slab->bitmap[ri >> LONG_BITS_SHIFT] =
				(1LU << (maxchunks - ri)) - 1;
	}
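	/*
	 * e.g. maxchunks = 100 with 64-bit longs: word 0 is set to
	 * ULONG_MAX (chunks 0-63 free) and word 1 to (1LU << 36) - 1
	 * (chunks 64-99 free); all remaining bits stay zero (allocated).
	 */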
slabfree(slab_t slab)
	if (slab->region->slab == slab) {
		nswath = slab->slab_size / NREGIONS_SIZE;
		for (j = 0; j < nswath; ++j)
			slab->region[j].slab = NULL;
		_vmem_free(slab->chunks, slab->slab_size);

	_vmem_free(slab, slab->slab_size);
/*
 * Terminate a slab's use in the current thread.  The slab may still have
 * outstanding allocations and thus not be deallocatable.
 */
slabterm(slglobaldata_t slgd, slab_t slab)
	slglobaldata_t sldepot;
	struct zoneinfo *zinfo;
	int zi = slab->zone_index;

	sldepot = &slglobaldepot;
	zinfo = &sldepot->zone[zi];

	/*
	 * Move the slab to the avail list or the empty list.
	 */
	_SPINLOCK(&sldepot->lock);
	if (slab->navail == slab->nmax) {
		slab->state = EMPTY;
		slab->next = zinfo->empty_base;
		zinfo->empty_base = slab;
		++zinfo->empty_count;
	} else {
		slab->state = AVAIL;
		slab->next = zinfo->avail_base;
		zinfo->avail_base = slab;
		++zinfo->avail_count;
	}

	/*
	 * Clean extra slabs out of the empty list
	 */
	while (zinfo->empty_count > opt_cache) {
		slab = zinfo->empty_base;
		zinfo->empty_base = slab->next;
		--zinfo->empty_count;
		slab->state = UNKNOWN;
	}
	_SPINUNLOCK(&sldepot->lock);

	_SPINLOCK(&sldepot->lock);
	_SPINUNLOCK(&sldepot->lock);
/*
 * Directly map memory in PAGE_SIZE'd chunks with the specified
 *
 * Alignment must be a multiple of PAGE_SIZE.
 *
 * Size must be >= alignment.
 */
_vmem_alloc(int ri, size_t slab_size)
	char *baddr = (void *)((uintptr_t)ri << NREGIONS_SHIFT);

	if (slab_size < NREGIONS_SIZE)
		eaddr = baddr + NREGIONS_SIZE;
	else
		eaddr = baddr + slab_size;

	/*
	 * This usually just works but might not.
	 */
	addr = mmap(baddr, slab_size, PROT_READ|PROT_WRITE,
		    MAP_PRIVATE | MAP_ANON | MAP_SIZEALIGN, -1, 0);
	if (addr == MAP_FAILED) {

	if (addr < baddr || addr + slab_size > eaddr) {
		munmap(addr, slab_size);

	/*
	 * Check alignment.  The misaligned offset is also the excess
	 * amount.  If misaligned unmap the excess so we have a chance of
	 * mapping at the next alignment point and recursively try again.
	 *
	 * BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB	block alignment
	 *   aaaaaaaaa aaaaaaaaaaa aa		mis-aligned allocation
	 *   xxxxxxxxx				final excess calculation
	 *   ^ returned address
	 */
	excess = (uintptr_t)addr & (slab_size - 1);
	excess = slab_size - excess;
	munmap(save + excess, slab_size - excess);
	addr = _vmem_alloc(ri, slab_size);
	munmap(save, excess);
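	/*
	 * Worked example (assuming save holds the misaligned mapping):
	 * with slab_size 1MB and mmap() returning ...40000, excess becomes
	 * 0x100000 - 0x40000 = 0xC0000; the 0x40000 bytes past the next
	 * boundary are unmapped, the recursion remaps at an aligned address
	 * (the retained prefix blocks the misaligned range), and finally
	 * the prefix itself is unmapped.
	 */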
	if (addr < baddr || addr + slab_size > eaddr) {
		munmap(addr, slab_size);

	excess = (uintptr_t)addr & (slab_size - 1);
/*
 * Free a chunk of memory allocated with _vmem_alloc()
 */
_vmem_free(void *ptr, size_t size)

/*
 * Panic on fatal conditions
 */
_mpanic(const char *ctl, ...)
	if (malloc_panic == 0) {
		vfprintf(stderr, ctl, va);
		fprintf(stderr, "\n");

__weak_reference(__aligned_alloc, aligned_alloc);
__weak_reference(__malloc, malloc);
__weak_reference(__calloc, calloc);
__weak_reference(__posix_memalign, posix_memalign);
__weak_reference(__realloc, realloc);
__weak_reference(__free, free);