2 * DMALLOC.C - Dillon's malloc
4 * Copyright (c) 2011 The DragonFly Project. All rights reserved.
6 * This code is derived from software contributed to The DragonFly Project
7 * by Matthew Dillon <dillon@backplane.com>.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
19 * 3. Neither the name of The DragonFly Project nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific, prior written permission.
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * This module implements a modified slab allocator drop-in replacement for
38 * the libc malloc(). The slab algorithm has been adjusted to support dynamic
39 * sizing of slabs, which effectively allows slabs to be used for allocations of
40 * any size. Because of this we have neither a small-block allocator nor a
41 * big-block allocator, and the code paths are simplified to the point where
42 * allocations, caching, and freeing are screaming fast.
44 * There is very little interaction between threads. A global depot accessed
45 * via atomic cmpxchg instructions (only! no spinlocks!) is used as a
46 * catch-all and to deal with thread exits and such.
48 * To support dynamic slab sizing available user virtual memory is broken
49 * down into ~1024 regions. Each region has a fixed slab size whose value is
50 * set when the region is opened up for use. The free() path simply applies
51 * a mask based on the region to the pointer to acquire the base of the
52 * governing slab structure.
54 * Regions[NREGIONS] (1024)
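 *
 * Illustrative sketch only (it mirrors the lookup memfree() performs below):
 *
 *	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
 *	slab = (slab_t)((uintptr_t)ptr & region->mask);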
56 * Slab management and locking is done on a per-zone basis.
58 * Alloc Size Chunking Number of zones
69 * ... continues unlimited ... 4 zones
71 * For a 2^63 memory space each doubling >= 64K is broken down into
72 * 4 chunking zones, so we support 88 + (48 * 4) = 280 zones.
74 * API FEATURES AND SIDE EFFECTS
76 * + power-of-2 sized allocations up to a page will be power-of-2 aligned.
77 * Above that, power-of-2 sized allocations are page-aligned. Non
78 * power-of-2 sized allocations are aligned the same as the chunk
79 * size for their zone.
80 * + malloc(0) returns a special non-NULL value
81 * + ability to allocate arbitrarily large chunks of memory
82 * + realloc will reuse the passed pointer if possible, within the
83 * limitations of the zone chunking.
87 * + [better] garbage collection
88 * + better initial sizing.
92 * The value of the environment variable MALLOC_OPTIONS is a character string
93 * containing various flags to tune nmalloc. Upper case letters enable
94 * or increase a feature, lower case letters disable or decrease it.
96 * U Enable UTRACE for all operations, observable with ktrace.
97 * Disabled by default.
99 * Z Zero out allocations, otherwise allocations (except for
100 * calloc) will contain garbage.
101 * Disabled by default.
103 * H Pass a hint with madvise() about unused pages.
104 * Disabled by default.
105 * Not currently implemented.
107 * F Disable local per-thread caching.
108 * Disabled by default.
110 * C Increase (decrease) how much excess cache to retain.
111 * Set to 4 by default.
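 *
 * Usage example (illustrative): enable allocation zeroing and bump the
 * per-thread cache retention one notch before running a program:
 *
 *	MALLOC_OPTIONS=ZC ./a.out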
114 /* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o dmalloc.so dmalloc.c */
116 #ifndef STANDALONE_DEBUG
117 #include "libc_private.h"
120 #include <sys/param.h>
121 #include <sys/types.h>
122 #include <sys/mman.h>
123 #include <sys/queue.h>
125 #include <sys/ktrace.h>
138 #include <machine/atomic.h>
139 #include <machine/cpufunc.h>
141 #ifdef STANDALONE_DEBUG
142 void _nmalloc_thr_init(void);
144 #include "spinlock.h"
145 #include "un-namespace.h"
148 #ifndef MAP_SIZEALIGN
149 #define MAP_SIZEALIGN 0
152 #if SSIZE_MAX == 0x7FFFFFFF
154 #define UVM_BITS 32 /* worst case */
157 #define UVM_BITS 48 /* worst case XXX */
160 #if LONG_MAX == 0x7FFFFFFF
162 #define LONG_BITS_SHIFT 5
165 #define LONG_BITS_SHIFT 6
168 #define LOCKEDPTR ((void *)(intptr_t)-1)
173 #define NREGIONS_BITS 10
174 #define NREGIONS (1 << NREGIONS_BITS)
175 #define NREGIONS_MASK (NREGIONS - 1)
176 #define NREGIONS_SHIFT (UVM_BITS - NREGIONS_BITS)
177 #define NREGIONS_SIZE (1LU << NREGIONS_SHIFT)
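/*
 * Example (illustrative): with UVM_BITS == 48 and NREGIONS_BITS == 10,
 * NREGIONS_SHIFT is 38, so each of the 1024 regions covers 1LU << 38
 * bytes of user virtual address space.
 */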
179 typedef struct region *region_t;
180 typedef struct slglobaldata *slglobaldata_t;
181 typedef struct slab *slab_t;
185 slab_t slab; /* conditional out of band slab */
188 static struct region Regions[NREGIONS];
191 * Number of chunking zones available
193 #define CHUNKFACTOR 8
195 #define NZONES (16 + 9 * CHUNKFACTOR + 16 * CHUNKFACTOR)
197 #define NZONES (16 + 9 * CHUNKFACTOR + 48 * CHUNKFACTOR)
200 static int MaxChunks[NZONES];
202 #define NDEPOTS 8 /* must be power of 2 */
205 * Maximum number of chunks per slab, governed by the allocation bitmap in
206 * each slab. The maximum is reduced for large chunk sizes.
208 #define MAXCHUNKS (LONG_BITS * LONG_BITS)
209 #define MAXCHUNKS_BITS (LONG_BITS_SHIFT * LONG_BITS_SHIFT)
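/*
 * Note (illustrative): on an LP64 system LONG_BITS is 64, so MAXCHUNKS is
 * 64 * 64 = 4096, matching the u_long bitmap[LONG_BITS] in struct slab.
 */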
210 #define LITSLABSIZE (32 * 1024)
211 #define NOMSLABSIZE (2 * 1024 * 1024)
212 #define BIGSLABSIZE (128 * 1024 * 1024)
214 #define ZALLOC_SLAB_MAGIC 0x736c6162 /* magic sanity */
216 TAILQ_HEAD(slab_list, slab);
222 struct slab *next; /* slabs with available space */
223 TAILQ_ENTRY(slab) entry;
224 int32_t magic; /* magic number for sanity check */
225 u_int navail; /* number of free elements available */
227 u_int free_bit; /* free hint bitno */
228 u_int free_index; /* free hint index */
229 u_long bitmap[LONG_BITS]; /* free chunks */
230 size_t slab_size; /* size of entire slab */
231 size_t chunk_size; /* chunk size for validation */
233 enum { UNKNOWN, AVAIL, EMPTY, FULL } state;
235 region_t region; /* related region */
236 char *chunks; /* chunk base */
243 struct slglobaldata {
250 struct slab_list full_zones; /* via entry */
254 slglobaldata_t sldepot;
257 #define SLAB_ZEROD 0x0001
260 * Misc constants. Note that allocations that are exact multiples of
261 * PAGE_SIZE, or exceed the zone limit, fall through to the kmem module.
262 * IN_SAME_PAGE_MASK is used to sanity-check the per-page free lists.
264 #define MIN_CHUNK_SIZE 8 /* in bytes */
265 #define MIN_CHUNK_MASK (MIN_CHUNK_SIZE - 1)
267 #define SAFLAG_ZERO 0x00000001
270 * The WEIRD_ADDR is used as known text to copy into free objects to
271 * try to create deterministic failure cases if the data is accessed after
274 * WARNING: A limited number of spinlocks are available, BIGXSIZE should
275 * not be larger than 64.
277 #define WEIRD_ADDR 0xdeadc0de
278 #define MAX_COPY sizeof(weirdary)
280 #define arysize(ary) (sizeof(ary)/sizeof((ary)[0]))
286 #define MASSERT(exp) do { if (__predict_false(!(exp))) \
287 _mpanic("assertion: %s in %s", \
291 /* With this attribute set, accessing this variable does not require a
292 * function call even when the code is compiled with -fPIC */
293 #define TLS_ATTRIBUTE __attribute__ ((tls_model ("initial-exec")))
295 static __thread struct slglobaldata slglobal TLS_ATTRIBUTE;
296 static pthread_key_t thread_malloc_key;
297 static pthread_once_t thread_malloc_once = PTHREAD_ONCE_INIT;
298 static struct slglobaldata sldepots[NDEPOTS];
300 static int opt_madvise = 0;
301 static int opt_free = 0;
302 static int opt_cache = 4;
303 static int opt_utrace = 0;
304 static int g_malloc_flags = 0;
305 static int malloc_panic;
306 static int malloc_started = 0;
308 static const int32_t weirdary[16] = {
309 WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
310 WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
311 WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
312 WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR
315 static void *memalloc(size_t size, int flags);
316 static void *memrealloc(void *ptr, size_t size);
317 static void memfree(void *ptr, int);
318 static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size);
319 static void slabfree(slab_t slab);
320 static void slabterm(slglobaldata_t slgd, slab_t slab);
321 static void *_vmem_alloc(int ri, size_t slab_size);
322 static void _vmem_free(void *ptr, size_t slab_size);
323 static void _mpanic(const char *ctl, ...) __printflike(1, 2);
324 #ifndef STANDALONE_DEBUG
325 static void malloc_init(void) __constructor(101);
327 static void malloc_init(void) __constructor(101);
331 struct nmalloc_utrace {
337 #define UTRACE(a, b, c) \
339 struct nmalloc_utrace ut = { \
344 utrace(&ut, sizeof(ut)); \
349 * If enabled, any memory allocated without M_ZERO is initialized to -1.
351 static int use_malloc_pattern;
357 const char *p = NULL;
358 static spinlock_t malloc_init_lock;
364 _SPINLOCK(&malloc_init_lock);
365 if (malloc_started) {
366 _SPINUNLOCK(&malloc_init_lock);
371 Regions[0].mask = -1; /* disallow activity in lowest region */
373 if (issetugid() == 0)
374 p = getenv("MALLOC_OPTIONS");
376 for (; p != NULL && *p != '\0'; p++) {
407 g_malloc_flags = SAFLAG_ZERO;
414 UTRACE((void *) -1, 0, NULL);
419 _SPINUNLOCK(&malloc_init_lock);
423 * We have to install a handler for nmalloc thread teardowns when
424 * the thread is created. We cannot delay this because destructors in
425 * sophisticated userland programs can call malloc() for the first time
426 * during their thread exit.
428 * This routine is called directly from pthreads.
430 static void _nmalloc_thr_init_once(void);
431 static void _nmalloc_thr_destructor(void *thrp);
434 _nmalloc_thr_init(void)
442 TAILQ_INIT(&slglobal.full_zones);
443 slglobal.sldepot = &sldepots[slgi & (NDEPOTS - 1)];
451 pthread_once(&thread_malloc_once, _nmalloc_thr_init_once);
453 pthread_setspecific(thread_malloc_key, &slglobal);
461 _nmalloc_thr_init_once(void)
465 error = pthread_key_create(&thread_malloc_key, _nmalloc_thr_destructor);
471 * Called for each thread undergoing exit
473 * Move all of the thread's slabs into a depot.
476 _nmalloc_thr_destructor(void *thrp)
478 slglobaldata_t slgd = thrp;
484 for (i = 0; i <= slgd->biggest_index; i++) {
485 while ((slab = slgd->zone[i].empty_base) != NULL) {
486 slgd->zone[i].empty_base = slab->next;
487 slabterm(slgd, slab);
490 while ((slab = slgd->zone[i].avail_base) != NULL) {
491 slgd->zone[i].avail_base = slab->next;
492 slabterm(slgd, slab);
495 while ((slab = TAILQ_FIRST(&slgd->full_zones)) != NULL) {
496 TAILQ_REMOVE(&slgd->full_zones, slab, entry);
497 slabterm(slgd, slab);
503 * Calculate the zone index for the allocation request size and set the
504 * allocation request size to that particular zone's chunk size.
507 zoneindex(size_t *bytes, size_t *chunking)
509 size_t n = (size_t)*bytes;
515 *bytes = n = (n + 7) & ~7;
517 return(n / 8); /* 8 byte chunks, 16 zones */
521 c = x / (CHUNKFACTOR * 2);
525 c = x / (CHUNKFACTOR * 2);
526 i = 16 + CHUNKFACTOR * 5; /* 256->512,1024,2048,4096,8192 */
533 _mpanic("slaballoc: byte value too high");
535 *bytes = n = roundup2(n, c);
537 return (i + n / c - CHUNKFACTOR);
539 *bytes = n = (n + c - 1) & ~(c - 1);
544 *bytes = n = (n + 15) & ~15;
546 return(n / (CHUNKINGLO*2) + CHUNKINGLO*1 - 1);
550 *bytes = n = (n + 31) & ~31;
552 return(n / (CHUNKINGLO*4) + CHUNKINGLO*2 - 1);
555 *bytes = n = (n + 63) & ~63;
557 return(n / (CHUNKINGLO*8) + CHUNKINGLO*3 - 1);
560 *bytes = n = (n + 127) & ~127;
562 return(n / (CHUNKINGLO*16) + CHUNKINGLO*4 - 1);
565 *bytes = n = (n + 255) & ~255;
567 return(n / (CHUNKINGLO*32) + CHUNKINGLO*5 - 1);
569 *bytes = n = (n + 511) & ~511;
571 return(n / (CHUNKINGLO*64) + CHUNKINGLO*6 - 1);
574 *bytes = n = (n + 1023) & ~1023;
576 return(n / (CHUNKINGLO*128) + CHUNKINGLO*7 - 1);
578 if (n < 32768) { /* 16384-32767 */
579 *bytes = n = (n + 2047) & ~2047;
581 return(n / (CHUNKINGLO*256) + CHUNKINGLO*8 - 1);
584 *bytes = n = (n + 4095) & ~4095; /* 32768-65535 */
586 return(n / (CHUNKINGLO*512) + CHUNKINGLO*9 - 1);
591 i = CHUNKINGLO*10 - 1;
598 _mpanic("slaballoc: byte value too high");
600 *bytes = n = (n + c - 1) & ~(c - 1);
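
/*
 * Illustrative use of zoneindex() (memalloc() below does exactly this):
 * the request size is rounded up in place to the zone's chunk size.
 *
 *	size_t chunking;
 *	int zi = zoneindex(&size, &chunking);
 *	MASSERT(zi < NZONES);
 */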
607 * malloc() - call internal slab allocator
614 ptr = memalloc(size, 0);
618 UTRACE(0, size, ptr);
623 * calloc() - call internal slab allocator
626 calloc(size_t number, size_t size)
630 ptr = memalloc(number * size, SAFLAG_ZERO);
634 UTRACE(0, number * size, ptr);
639 * realloc() (SLAB ALLOCATOR)
641 * We do not attempt to optimize this routine beyond reusing the same
642 * pointer if the new size fits within the chunking of the old pointer's
646 realloc(void *ptr, size_t size)
651 ret = memalloc(size, 0);
653 ret = memrealloc(ptr, size);
657 UTRACE(ptr, size, ret);
664 * Allocate (size) bytes with an alignment of (alignment), where (alignment)
665 * is a power of 2 >= sizeof(void *).
667 * The slab allocator will allocate on power-of-2 boundaries up to at least
668 * PAGE_SIZE. Otherwise we use the zoneindex mechanic to find a zone
669 * matching the requirements.
672 posix_memalign(void **memptr, size_t alignment, size_t size)
675 * OpenGroup spec issue 6 checks
677 if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) {
681 if (alignment < sizeof(void *)) {
687 * XXX for now just find the nearest power of 2 >= size and also
688 * >= alignment and allocate that.
690 while (alignment < size) {
693 _mpanic("posix_memalign: byte value too high");
695 *memptr = memalloc(alignment, 0);
696 return(*memptr ? 0 : ENOMEM);
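
/*
 * Worked example (illustrative): posix_memalign(&p, 64, 100) doubles the
 * alignment until it is >= size, so 128 bytes are requested; a power-of-2
 * sized allocation of that size is aligned to at least 128 bytes.
 */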
700 * free() (SLAB ALLOCATOR) - do the obvious
712 * memalloc() (SLAB ALLOCATOR)
714 * Allocate memory via the slab allocator.
717 memalloc(size_t size, int flags)
720 struct zoneinfo *zinfo;
737 * If 0 bytes is requested we have to return a unique pointer, allocate
743 /* Capture global flags */
744 flags |= g_malloc_flags;
746 /* Compute allocation zone; zoneindex will panic on excessive sizes */
747 zi = zoneindex(&size, &chunking);
748 MASSERT(zi < NZONES);
753 * Locate a slab with available space. If no slabs are available,
754 * back off to the empty list, and if we still come up dry allocate
755 * a new slab (which will try the depot first).
759 zinfo = &slgd->zone[zi];
760 if ((slab = zinfo->avail_base) == NULL) {
761 if ((slab = zinfo->empty_base) == NULL) {
765 slab = slaballoc(zi, chunking, size);
768 slab->next = zinfo->avail_base;
769 zinfo->avail_base = slab;
771 if (slgd->biggest_index < zi)
772 slgd->biggest_index = zi;
776 * Pulled from empty list
778 zinfo->empty_base = slab->next;
779 slab->next = zinfo->avail_base;
780 zinfo->avail_base = slab;
782 --zinfo->empty_count;
787 * Allocate a chunk out of the slab. HOT PATH
789 * Only the thread owning the slab can allocate out of it.
791 * NOTE: The last bit in the bitmap is always marked allocated so
792 * we cannot overflow here.
794 bno = slab->free_bit;
795 bmi = slab->free_index;
796 bmp = &slab->bitmap[bmi];
797 if (*bmp & (1LU << bno)) {
798 atomic_clear_long(bmp, 1LU << bno);
799 obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size;
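		/* e.g. on LP64, bmi=1/bno=3 selects chunk (1 << 6) + 3 = 67 above */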
800 slab->free_bit = (bno + 1) & (LONG_BITS - 1);
801 atomic_add_int(&slab->navail, -1);
802 if (flags & SAFLAG_ZERO)
808 * Allocate a chunk out of a slab. COLD PATH
810 if (slab->navail == 0) {
811 zinfo->avail_base = slab->next;
813 TAILQ_INSERT_TAIL(&slgd->full_zones, slab, entry);
817 while (bmi < LONG_BITS) {
818 bmp = &slab->bitmap[bmi];
821 atomic_clear_long(bmp, 1LU << bno);
822 obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
824 slab->free_index = bmi;
825 slab->free_bit = (bno + 1) & (LONG_BITS - 1);
826 atomic_add_int(&slab->navail, -1);
827 if (flags & SAFLAG_ZERO)
834 while (bmi < LONG_BITS) {
835 bmp = &slab->bitmap[bmi];
838 atomic_clear_long(bmp, 1LU << bno);
839 obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
841 slab->free_index = bmi;
842 slab->free_bit = (bno + 1) & (LONG_BITS - 1);
843 atomic_add_int(&slab->navail, -1);
844 if (flags & SAFLAG_ZERO)
850 _mpanic("slaballoc: corrupted zone: navail %d", slab->navail);
856 * Reallocate memory within the chunk
859 memrealloc(void *ptr, size_t nsize)
868 * If 0 bytes is requested we have to return a unique pointer, allocate
874 /* Capture global flags */
875 flags |= g_malloc_flags;
878 * Locate the zone by looking up the dynamic slab size mask based
879 * on the memory region the allocation resides in.
881 region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
882 if ((slab = region->slab) == NULL)
883 slab = (void *)((uintptr_t)ptr & region->mask);
884 MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
885 osize = slab->chunk_size;
886 if (nsize <= osize) {
887 if (osize < 32 || nsize >= osize / 2) {
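			/*
			 * e.g. shrinking a 1024-byte chunk to 600 bytes is done
			 * in place; shrinking it to 100 bytes falls through and
			 * reallocates from a smaller zone instead.
			 */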
889 if ((flags & SAFLAG_ZERO) && nsize < osize)
890 bzero(obj + nsize, osize - nsize);
896 * Otherwise resize the object
898 obj = memalloc(nsize, 0);
902 bcopy(ptr, obj, nsize);
909 * free (SLAB ALLOCATOR)
911 * Free a memory block previously allocated by malloc.
916 memfree(void *ptr, int flags)
929 * Locate the zone by looking up the dynamic slab size mask based
930 * on the memory region the allocation resides in.
932 * WARNING! The slab may be owned by another thread!
934 region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
935 if ((slab = region->slab) == NULL)
936 slab = (void *)((uintptr_t)ptr & region->mask);
937 MASSERT(slab != NULL);
938 MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
942 * Put weird data into the memory to detect modifications after
943 * freeing, illegal pointer use after freeing (we should fault on
944 * the odd address), and so forth.
946 if (slab->chunk_size < sizeof(weirdary))
947 bcopy(weirdary, ptr, slab->chunk_size);
949 bcopy(weirdary, ptr, sizeof(weirdary));
952 bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size;
953 bmi = bno >> LONG_BITS_SHIFT;
954 bno &= (LONG_BITS - 1);
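	/* e.g. on LP64, chunk index 67 decomposes to bmi = 1, bno = 3 */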
955 bmp = &slab->bitmap[bmi];
957 MASSERT(bmi >= 0 && bmi < slab->nmax);
958 MASSERT((*bmp & (1LU << bno)) == 0);
959 atomic_set_long(bmp, 1LU << bno);
960 atomic_add_int(&slab->navail, 1);
963 * We can only do the following if we own the slab
966 if (slab->slgd == slgd) {
967 struct zoneinfo *zinfo;
969 if (slab->free_index > bmi) {
970 slab->free_index = bmi;
971 slab->free_bit = bno;
972 } else if (slab->free_index == bmi &&
973 slab->free_bit > bno) {
974 slab->free_bit = bno;
976 zinfo = &slgd->zone[slab->zone_index];
979 * Freeing an object from a full slab will move it to the
980 * available list. If the available list already has a
981 * slab we terminate the full slab instead, moving it to
984 if (slab->state == FULL) {
985 TAILQ_REMOVE(&slgd->full_zones, slab, entry);
986 if (zinfo->avail_base == NULL) {
988 stmp = zinfo->avail_base;
990 zinfo->avail_base = slab;
992 slabterm(slgd, slab);
998 * If the slab becomes completely empty dispose of it in
999 * some manner. By default each thread caches up to 4
1000 * empty slabs. Only small slabs are cached.
1002 if (slab->navail == slab->nmax && slab->state == AVAIL) {
1004 * Remove slab from available queue
1006 slabp = &zinfo->avail_base;
1007 while ((stmp = *slabp) != slab)
1008 slabp = &stmp->next;
1009 *slabp = slab->next;
1011 if (opt_free || opt_cache == 0) {
1013 * If local caching is disabled cache the
1014 * slab in the depot (or free it).
1016 slabterm(slgd, slab);
1017 } else if (slab->slab_size > BIGSLABSIZE) {
1019 * We do not try to retain large slabs
1020 * in per-thread caches.
1022 slabterm(slgd, slab);
1023 } else if (zinfo->empty_count > opt_cache) {
1025 * We have too many slabs cached, but
1026 * instead of freeing this one, free
1027 * an empty slab that's been idle longer.
1029 * (empty_count does not change)
1031 stmp = zinfo->empty_base;
1032 slab->state = EMPTY;
1033 slab->next = stmp->next;
1034 zinfo->empty_base = slab;
1035 slabterm(slgd, stmp);
1038 * Cache the empty slab in our thread local
1041 ++zinfo->empty_count;
1042 slab->state = EMPTY;
1043 slab->next = zinfo->empty_base;
1044 zinfo->empty_base = slab;
1053 * Allocate a new slab holding objects of size chunk_size.
1056 slaballoc(int zi, size_t chunking, size_t chunk_size)
1058 slglobaldata_t slgd;
1059 slglobaldata_t sldepot;
1060 struct zoneinfo *zinfo;
1068 uintptr_t chunk_offset;
1079 * First look in the depot. Any given zone in the depot may be
1080 * locked by being set to -1. We have to do this instead of simply
1081 * removing the entire chain because removing the entire chain can
1082 * cause racing threads to allocate local slabs for large objects,
1083 * resulting in a large VSZ.
1086 sldepot = slgd->sldepot;
1087 zinfo = &sldepot->zone[zi];
1089 while ((slab = zinfo->avail_base) != NULL) {
1090 if ((void *)slab == LOCKEDPTR) {
1094 if (atomic_cmpset_ptr(&zinfo->avail_base, slab, LOCKEDPTR)) {
1095 MASSERT(slab->slgd == NULL);
1097 zinfo->avail_base = slab->next;
1103 * Nothing in the depot, allocate a new slab by locating or assigning
1104 * a region and then using the system virtual memory allocator.
1109 * Calculate the start of the data chunks relative to the start
1112 if ((chunk_size ^ (chunk_size - 1)) == (chunk_size << 1) - 1) {
1114 chunk_offset = roundup2(sizeof(*slab), chunk_size);
1117 chunk_offset = sizeof(*slab) + chunking - 1;
1118 chunk_offset -= chunk_offset % chunking;
1122 * Calculate a reasonable number of chunks for the slab.
1124 * Once initialized the MaxChunks[] array can only ever be
1125 * reinitialized to the same value.
1127 maxchunks = MaxChunks[zi];
1128 if (maxchunks == 0) {
1130 * First calculate how many chunks would fit in 1/1024
1131 * available memory. This is around 2MB on a 32 bit
1132 * system and 128G on a 64-bit (48-bits available) system.
1134 maxchunks = (ssize_t)(NREGIONS_SIZE - chunk_offset) /
1135 (ssize_t)chunk_size;
1140 * A slab cannot handle more than MAXCHUNKS chunks, but
1141 * limit us to approximately MAXCHUNKS / 2 here because
1142 * we may have to expand maxchunks when we calculate the
1143 * actual power-of-2 slab.
1145 if (maxchunks > MAXCHUNKS / 2)
1146 maxchunks = MAXCHUNKS / 2;
1149 * Try to limit the slabs to BIGSLABSIZE (~128MB). Larger
1150 * slabs will be created if the allocation does not fit.
1152 if (chunk_offset + chunk_size * maxchunks > BIGSLABSIZE) {
1153 tmpchunks = (ssize_t)(BIGSLABSIZE - chunk_offset) /
1154 (ssize_t)chunk_size;
1157 if (maxchunks > tmpchunks)
1158 maxchunks = tmpchunks;
1162 * If the slab calculates to greater than 2MB see if we
1163 * can cut it down to ~2MB. This controls VSZ but has
1164 * no effect on run-time size or performance.
1166 * This is very important in case you core dump and also
1167 * important to reduce unnecessary region allocations.
1169 if (chunk_offset + chunk_size * maxchunks > NOMSLABSIZE) {
1170 tmpchunks = (ssize_t)(NOMSLABSIZE - chunk_offset) /
1171 (ssize_t)chunk_size;
1174 if (maxchunks > tmpchunks)
1175 maxchunks = tmpchunks;
1179 * If the slab calculates to greater than 128K see if we
1180 * can cut it down to ~128K while still maintaining a
1181 * reasonably large number of chunks in each slab. This
1182 * controls VSZ but has no effect on run-time size or
1185 * This is very important in case you core dump and also
1186 * important to reduce unnecessary region allocations.
1188 if (chunk_offset + chunk_size * maxchunks > LITSLABSIZE) {
1189 tmpchunks = (ssize_t)(LITSLABSIZE - chunk_offset) /
1190 (ssize_t)chunk_size;
1193 if (maxchunks > tmpchunks)
1194 maxchunks = tmpchunks;
1197 MaxChunks[zi] = maxchunks;
1199 MASSERT(maxchunks > 0 && maxchunks <= MAXCHUNKS);
1202 * Calculate the actual slab size. maxchunks will be recalculated
1205 slab_desire = chunk_offset + chunk_size * maxchunks;
1206 slab_size = 8 * MAXCHUNKS;
1207 power = 3 + MAXCHUNKS_BITS;
1208 while (slab_size < slab_desire) {
1214 * Do a quick recalculation based on the actual slab size but not
1215 * yet dealing with whether the slab header is in-band or out-of-band.
1216 * The purpose here is to see if we can reasonably reduce slab_size
1217 * to a power of 4 to allow more chunk sizes to use the same slab
1220 if ((power & 1) && slab_size > 32768) {
1221 maxchunks = (slab_size - chunk_offset) / chunk_size;
1222 if (maxchunks >= MAXCHUNKS / 8) {
1227 if ((power & 2) && slab_size > 32768 * 4) {
1228 maxchunks = (slab_size - chunk_offset) / chunk_size;
1229 if (maxchunks >= MAXCHUNKS / 4) {
1235 * This case occurs when the slab_size is larger than 1/1024 available
1238 nswath = slab_size / NREGIONS_SIZE;
1239 if (nswath > NREGIONS)
1244 * Try to allocate from our current best region for this zi
1246 region_mask = ~(slab_size - 1);
1247 ri = slgd->zone[zi].best_region;
1248 if (Regions[ri].mask == region_mask) {
1249 if ((slab = _vmem_alloc(ri, slab_size)) != NULL)
1254 * Try to find an existing region to allocate from. The normal
1255 * case will be for allocations that are less than 1/1024 available
1256 * UVM, which fit into a single Regions[] entry.
1258 while (slab_size <= NREGIONS_SIZE) {
1260 for (ri = 0; ri < NREGIONS; ++ri) {
1261 if (rx < 0 && Regions[ri].mask == 0)
1263 if (Regions[ri].mask == region_mask) {
1264 slab = _vmem_alloc(ri, slab_size);
1266 slgd->zone[zi].best_region = ri;
1276 * This can fail, retry either way
1278 atomic_cmpset_ptr((void **)&Regions[rx].mask,
1280 (void *)region_mask);
1285 for (ri = 0; ri < NREGIONS; ri += nswath) {
1286 if (Regions[ri].mask == region_mask) {
1287 slab = _vmem_alloc(ri, slab_size);
1289 slgd->zone[zi].best_region = ri;
1294 for (j = nswath - 1; j >= 0; --j) {
1295 if (Regions[ri+j].mask != 0)
1304 * We found a candidate, try to allocate it backwards so
1305 * another thread racing a slaballoc() does not see the
1306 * mask in the base index position until we are done.
1308 * We can safely zero-out any partial allocations because
1309 * the mask is only accessed from the base index. Any other
1310 * threads racing us will fail prior to us clearing the mask.
1314 for (j = nswath - 1; j >= 0; --j) {
1315 if (!atomic_cmpset_ptr((void **)&Regions[rx+j].mask,
1316 NULL, (void *)region_mask)) {
1317 while (++j < nswath)
1318 Regions[rx+j].mask = 0;
1326 * Fill in the new slab in region ri. If the slab_size completely
1327 * fills one or more region slots we move the slab structure out of
1328 * band which should optimize the chunking (particularly for a power
1332 region = &Regions[ri];
1333 MASSERT(region->slab == NULL);
1334 if (slab_size >= NREGIONS_SIZE) {
1336 slab = memalloc(sizeof(*slab), 0);
1337 bzero(slab, sizeof(*slab));
1338 slab->chunks = save;
1339 for (j = 0; j < nswath; ++j)
1340 region[j].slab = slab;
1343 bzero(slab, sizeof(*slab));
1344 slab->chunks = (char *)slab + chunk_offset;
1348 * Calculate the start of the chunks memory and recalculate the
1349 * actual number of chunks the slab can hold.
1351 maxchunks = (slab_size - chunk_offset) / chunk_size;
1352 if (maxchunks > MAXCHUNKS)
1353 maxchunks = MAXCHUNKS;
1356 * And fill in the rest
1358 slab->magic = ZALLOC_SLAB_MAGIC;
1359 slab->navail = maxchunks;
1360 slab->nmax = maxchunks;
1361 slab->slab_size = slab_size;
1362 slab->chunk_size = chunk_size;
1363 slab->zone_index = zi;
1365 slab->state = UNKNOWN;
1366 slab->region = region;
1368 for (ri = 0; ri < maxchunks; ri += LONG_BITS) {
1369 if (ri + LONG_BITS <= maxchunks)
1370 slab->bitmap[ri >> LONG_BITS_SHIFT] = ULONG_MAX;
1372 slab->bitmap[ri >> LONG_BITS_SHIFT] =
1373 (1LU << (maxchunks - ri)) - 1;
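	/*
	 * e.g. maxchunks = 100 on LP64: bitmap[0] = ULONG_MAX (chunks 0-63)
	 * and bitmap[1] = (1LU << 36) - 1 (chunks 64-99), all marked free.
	 */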
1382 slabfree(slab_t slab)
1387 if (slab->region->slab == slab) {
1391 nswath = slab->slab_size / NREGIONS_SIZE;
1392 for (j = 0; j < nswath; ++j)
1393 slab->region[j].slab = NULL;
1395 _vmem_free(slab->chunks, slab->slab_size);
1402 _vmem_free(slab, slab->slab_size);
1407 * Terminate a slab's use in the current thread. The slab may still have
1408 * outstanding allocations and thus not be deallocatable.
1411 slabterm(slglobaldata_t slgd, slab_t slab)
1413 slglobaldata_t sldepot = slgd->sldepot;
1414 struct zoneinfo *zinfo;
1416 int zi = slab->zone_index;
1420 zinfo = &sldepot->zone[zi];
1423 * If the slab can be freed and the depot is either locked or not
1424 * empty, then free the slab.
1426 if (slab->navail == slab->nmax && zinfo->avail_base) {
1427 slab->state = UNKNOWN;
1431 slab->state = AVAIL;
1434 * Link the slab into the depot
1437 dnext = zinfo->avail_base;
1439 if ((void *)dnext == LOCKEDPTR) {
1444 if (atomic_cmpset_ptr(&zinfo->avail_base, dnext, slab))
1452 * Directly map memory in PAGE_SIZE'd chunks with the specified
1455 * Alignment must be a multiple of PAGE_SIZE.
1457 * Size must be >= alignment.
1460 _vmem_alloc(int ri, size_t slab_size)
1462 char *baddr = (void *)((uintptr_t)ri << NREGIONS_SHIFT);
1468 if (slab_size < NREGIONS_SIZE)
1469 eaddr = baddr + NREGIONS_SIZE;
1471 eaddr = baddr + slab_size;
1474 * This usually just works but might not.
1476 addr = mmap(baddr, slab_size, PROT_READ|PROT_WRITE,
1477 MAP_PRIVATE | MAP_ANON | MAP_SIZEALIGN, -1, 0);
1478 if (addr == MAP_FAILED) {
1481 if (addr < baddr || addr + slab_size > eaddr) {
1482 munmap(addr, slab_size);
1487 * Check alignment. The misaligned offset is also the excess
1488 * amount. If misaligned unmap the excess so we have a chance of
1489 * mapping at the next alignment point and recursively try again.
1491 * BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB block alignment
1492 * aaaaaaaaa aaaaaaaaaaa aa mis-aligned allocation
1493 * xxxxxxxxx final excess calculation
1494 * ^ returned address
1496 excess = (uintptr_t)addr & (slab_size - 1);
1498 excess = slab_size - excess;
1501 munmap(save + excess, slab_size - excess);
1502 addr = _vmem_alloc(ri, slab_size);
1503 munmap(save, excess);
1506 if (addr < baddr || addr + slab_size > eaddr) {
1507 munmap(addr, slab_size);
1510 excess = (uintptr_t)addr & (slab_size - 1);
1518 * Free a chunk of memory allocated with _vmem_alloc()
1521 _vmem_free(void *ptr, size_t size)
1527 * Panic on fatal conditions
1530 _mpanic(const char *ctl, ...)
1534 if (malloc_panic == 0) {
1537 vfprintf(stderr, ctl, va);
1538 fprintf(stderr, "\n");