/*
 * DMALLOC.C	- Dillon's malloc
 *
 * Copyright (c) 2011 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module implements a modified slab allocator as a drop-in replacement
 * for the libc malloc().  The slab algorithm has been adjusted to support
 * dynamic sizing of slabs which effectively allows slabs to be used for
 * allocations of any size.  Because of this we have neither a small-block
 * allocator nor a big-block allocator and the code paths are simplified to
 * the point where allocation, caching, and freeing are screaming fast.
 *
 * There is very little interaction between threads.  A global depot accessed
 * via atomic cmpxchg instructions (only! no spinlocks!) is used as a
 * catch-all and to deal with thread exits and such.
 *
 * To support dynamic slab sizing, available user virtual memory is broken
 * down into ~1024 regions.  Each region has a fixed slab size whose value is
 * set when the region is opened up for use.  The free() path simply applies
 * a mask based on the region to the pointer to acquire the base of the
 * governing slab structure.
 *
 *	Regions[NREGIONS]	(1024)
 *
 * Slab management and locking are done on a per-zone basis.
 *
 *	Alloc Size	Chunking	Number of zones
 *	0-127		8		16
 *	128-255		16		8
 *	256-511		32		8
 *	512-1023	64		8
 *	1024-2047	128		8
 *	2048-4095	256		8
 *	4096-8191	512		8
 *	8192-16383	1024		8
 *	16384-32767	2048		8
 *	32768-65535	4096		8
 *	... continues unlimited ...	4 zones
 *
 *	For a 2^63 memory space each doubling >= 64K is broken down into
 *	4 chunking zones, so we support 88 + (48 * 4) = 280 zones.
 *
 * API FEATURES AND SIDE EFFECTS
 *
 *    + power-of-2 sized allocations up to a page will be power-of-2 aligned.
 *	Above that, power-of-2 sized allocations are page-aligned.  Non
 *	power-of-2 sized allocations are aligned the same as the chunk
 *	size for their zone.
 *    + malloc(0) returns a special non-NULL value
 *    + ability to allocate arbitrarily large chunks of memory
 *    + realloc will reuse the passed pointer if possible, within the
 *	limitations of the zone chunking.
 *
 * FUTURE FEATURES
 *
 *    + [better] garbage collection
 *    + better initial sizing.
 *
 * TUNING
 *
 * The value of the environment variable MALLOC_OPTIONS is a character string
 * containing various flags to tune nmalloc.  Upper case letters enable or
 * increase the feature, lower case letters disable or decrease the feature.
 *
 * U		Enable UTRACE for all operations, observable with ktrace.
 *		Disabled by default.
 *
 * Z		Zero out allocations, otherwise allocations (except for
 *		calloc) will contain garbage.
 *		Disabled by default.
 *
 * H		Pass a hint with madvise() about unused pages.
 *		Disabled by default.
 *		Not currently implemented.
 *
 * F		Disable local per-thread caching.
 *		Disabled by default.
 *
 * C		Increase (decrease) how much excess cache to retain.
 *		Set to 4 by default.
 */

/* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o dmalloc.so dmalloc.c */

#ifndef STANDALONE_DEBUG
#include "libc_private.h"
#endif

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#ifdef STANDALONE_DEBUG
void _nmalloc_thr_init(void);
#else
#include "spinlock.h"
#include "un-namespace.h"
#endif

#ifndef MAP_SIZEALIGN
#define MAP_SIZEALIGN	0
#endif

#if SSIZE_MAX == 0x7FFFFFFF
#define ADDRBITS	32
#define UVM_BITS	32	/* worst case */
#else
#define ADDRBITS	64
#define UVM_BITS	48	/* worst case XXX */
#endif

#if LONG_MAX == 0x7FFFFFFF
#define LONG_BITS	32
#define LONG_BITS_SHIFT	5
#else
#define LONG_BITS	64
#define LONG_BITS_SHIFT	6
#endif

#define LOCKEDPTR	((void *)(intptr_t)-1)

/*
 * Regions[]
 */
#define NREGIONS_BITS	10
#define NREGIONS	(1 << NREGIONS_BITS)
#define NREGIONS_MASK	(NREGIONS - 1)
#define NREGIONS_SHIFT	(UVM_BITS - NREGIONS_BITS)
#define NREGIONS_SIZE	(1LU << NREGIONS_SHIFT)

typedef struct region *region_t;
typedef struct slglobaldata *slglobaldata_t;
typedef struct slab *slab_t;

struct region {
	uintptr_t	mask;
	slab_t		slab;	/* conditional out of band slab */
};

static struct region Regions[NREGIONS];

/*
 * Number of chunking zones available
 */
#define CHUNKFACTOR	8
#if ADDRBITS == 32
#define NZONES		(16 + 9 * CHUNKFACTOR + 16 * CHUNKFACTOR)
#else
#define NZONES		(16 + 9 * CHUNKFACTOR + 48 * CHUNKFACTOR)
#endif

static int MaxChunks[NZONES];

#define NDEPOTS		8		/* must be power of 2 */

/*
 * Maximum number of chunks per slab, governed by the allocation bitmap in
 * each slab.  The maximum is reduced for large chunk sizes.
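 *
 * For example, with LONG_BITS 64 (LONG_BITS_SHIFT 6) the per-slab bitmap
 * below holds 64 * 64 = 4096 bits, so MAXCHUNKS is 4096; a 32 bit build
 * (LONG_BITS 32) tops out at 32 * 32 = 1024 chunks per slab.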
 */
#define MAXCHUNKS	(LONG_BITS * LONG_BITS)
#define MAXCHUNKS_BITS	(LONG_BITS_SHIFT * LONG_BITS_SHIFT)
#define LITSLABSIZE	(32 * 1024)
#define NOMSLABSIZE	(2 * 1024 * 1024)
#define BIGSLABSIZE	(128 * 1024 * 1024)

#define ZALLOC_SLAB_MAGIC	0x736c6162	/* magic sanity */

TAILQ_HEAD(slab_list, slab);

/*
 * A slab structure
 */
struct slab {
	struct slab	*next;		/* slabs with available space */
	TAILQ_ENTRY(slab) entry;
	int32_t		magic;		/* magic number for sanity check */
	u_int		navail;		/* number of free elements available */
	u_int		nmax;
	u_int		free_bit;	/* free hint bitno */
	u_int		free_index;	/* free hint index */
	u_long		bitmap[LONG_BITS]; /* free chunks */
	size_t		slab_size;	/* size of entire slab */
	size_t		chunk_size;	/* chunk size for validation */
	int		zone_index;
	enum { UNKNOWN, AVAIL, EMPTY, FULL } state;
	int		flags;
	region_t	region;		/* related region */
	char		*chunks;	/* chunk base */
	slglobaldata_t	slgd;
};

/*
 * per-thread data
 */
struct slglobaldata {
	struct zoneinfo {
		slab_t	avail_base;
		slab_t	empty_base;
		int	best_region;
		int	empty_count;
	} zone[NZONES];
	struct slab_list full_zones;	/* via entry */
	int		masked;
	int		biggest_index;
	size_t		nslabs;
	slglobaldata_t	sldepot;
};

#define SLAB_ZEROD	0x0001

/*
 * Misc constants.  Note that allocations that are exact multiples of
 * PAGE_SIZE, or exceed the zone limit, fall through to the kmem module.
 * IN_SAME_PAGE_MASK is used to sanity-check the per-page free lists.
 */
#define MIN_CHUNK_SIZE	8		/* in bytes */
#define MIN_CHUNK_MASK	(MIN_CHUNK_SIZE - 1)

#define SAFLAG_ZERO	0x00000001

/*
 * The WEIRD_ADDR is used as known text to copy into free objects to
 * try to create deterministic failure cases if the data is accessed after
 * free.
 *
 * WARNING: A limited number of spinlocks are available, BIGXSIZE should
 *	    not be larger than 64.
 */
#define WEIRD_ADDR	0xdeadc0de
#define MAX_COPY	sizeof(weirdary)

#define arysize(ary)	(sizeof(ary)/sizeof((ary)[0]))

/*
 * Thread control
 */
#define MASSERT(exp)	do { if (__predict_false(!(exp)))	\
				_mpanic("assertion: %s in %s",	\
					#exp, __func__);	\
			} while (0)

/*
 * With this attribute set, do not require a function call for accessing
 * this variable when the code is compiled -fPIC
 */
#define TLS_ATTRIBUTE	__attribute__ ((tls_model ("initial-exec")));

static __thread struct slglobaldata slglobal TLS_ATTRIBUTE;
static pthread_key_t thread_malloc_key;
static pthread_once_t thread_malloc_once = PTHREAD_ONCE_INIT;
static struct slglobaldata sldepots[NDEPOTS];

static int opt_madvise = 0;
static int opt_free = 0;
static int opt_cache = 4;
static int opt_utrace = 0;
static int g_malloc_flags = 0;
static int malloc_panic;
static int malloc_started = 0;

static const int32_t weirdary[16] = {
	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR
};

static void *memalloc(size_t size, int flags);
static void *memrealloc(void *ptr, size_t size);
static void memfree(void *ptr, int);
static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size);
static void slabfree(slab_t slab);
static void slabterm(slglobaldata_t slgd, slab_t slab);
static void *_vmem_alloc(int ri, size_t slab_size);
static void _vmem_free(void *ptr, size_t slab_size);
static void _mpanic(const char *ctl, ...)
__printflike(1, 2); #ifndef STANDALONE_DEBUG static void malloc_init(void) __constructor(0); #else static void malloc_init(void) __constructor(101); #endif struct nmalloc_utrace { void *p; size_t s; void *r; }; #define UTRACE(a, b, c) \ if (opt_utrace) { \ struct nmalloc_utrace ut = { \ .p = (a), \ .s = (b), \ .r = (c) \ }; \ utrace(&ut, sizeof(ut)); \ } #ifdef INVARIANTS /* * If enabled any memory allocated without M_ZERO is initialized to -1. */ static int use_malloc_pattern; #endif static void malloc_init(void) { const char *p = NULL; static spinlock_t malloc_init_lock; if (malloc_started) return; if (__isthreaded) { _SPINLOCK(&malloc_init_lock); if (malloc_started) { _SPINUNLOCK(&malloc_init_lock); return; } } Regions[0].mask = -1; /* disallow activity in lowest region */ if (issetugid() == 0) p = getenv("MALLOC_OPTIONS"); for (; p != NULL && *p != '\0'; p++) { switch(*p) { case 'u': opt_utrace = 0; break; case 'U': opt_utrace = 1; break; case 'h': opt_madvise = 0; break; case 'H': opt_madvise = 1; break; case 'c': if (opt_cache > 0) --opt_cache; break; case 'C': ++opt_cache; break; case 'f': opt_free = 0; break; case 'F': opt_free = 1; break; case 'z': g_malloc_flags = 0; break; case 'Z': g_malloc_flags = SAFLAG_ZERO; break; default: break; } } UTRACE((void *) -1, 0, NULL); _nmalloc_thr_init(); malloc_started = 1; if (__isthreaded) _SPINUNLOCK(&malloc_init_lock); } /* * We have to install a handler for nmalloc thread teardowns when * the thread is created. We cannot delay this because destructors in * sophisticated userland programs can call malloc() for the first time * during their thread exit. * * This routine is called directly from pthreads. */ static void _nmalloc_thr_init_once(void); static void _nmalloc_thr_destructor(void *thrp); void _nmalloc_thr_init(void) { static int did_init; static int SLGI; int slgi; slgi = SLGI++; cpu_ccfence(); TAILQ_INIT(&slglobal.full_zones); slglobal.sldepot = &sldepots[slgi & (NDEPOTS - 1)]; if (slglobal.masked) return; slglobal.masked = 1; if (did_init == 0) { did_init = 1; pthread_once(&thread_malloc_once, _nmalloc_thr_init_once); } pthread_setspecific(thread_malloc_key, &slglobal); slglobal.masked = 0; } /* * Called just once */ static void _nmalloc_thr_init_once(void) { int error; error = pthread_key_create(&thread_malloc_key, _nmalloc_thr_destructor); if (error) abort(); } /* * Called for each thread undergoing exit * * Move all of the thread's slabs into a depot. */ static void _nmalloc_thr_destructor(void *thrp) { slglobaldata_t slgd = thrp; slab_t slab; int i; slgd->masked = 1; for (i = 0; i <= slgd->biggest_index; i++) { while ((slab = slgd->zone[i].empty_base) != NULL) { slgd->zone[i].empty_base = slab->next; slabterm(slgd, slab); } while ((slab = slgd->zone[i].avail_base) != NULL) { slgd->zone[i].avail_base = slab->next; slabterm(slgd, slab); } while ((slab = TAILQ_FIRST(&slgd->full_zones)) != NULL) { TAILQ_REMOVE(&slgd->full_zones, slab, entry); slabterm(slgd, slab); } } } /* * Calculate the zone index for the allocation request size and set the * allocation request size to that particular zone's chunk size. 
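 *
 * For example, following the chunking table in the header comment: a
 * 100 byte request is rounded up to 104 bytes (8 byte chunking) and
 * lands in zone 13, while a 3000 byte request is rounded up to 3072
 * bytes (256 byte chunking) and lands in zone 52.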
*/ static __inline int zoneindex(size_t *bytes, size_t *chunking) { size_t n = (size_t)*bytes; size_t x; size_t c; int i; if (n < 128) { *bytes = n = (n + 7) & ~7; *chunking = 8; return(n / 8); /* 8 byte chunks, 16 zones */ } if (n < 4096) { x = 256; c = x / (CHUNKFACTOR * 2); i = 16; } else { x = 8192; c = x / (CHUNKFACTOR * 2); i = 16 + CHUNKFACTOR * 5; /* 256->512,1024,2048,4096,8192 */ } while (n >= x) { x <<= 1; c <<= 1; i += CHUNKFACTOR; if (x == 0) _mpanic("slaballoc: byte value too high"); } *bytes = n = (n + c - 1) & ~(c - 1); *chunking = c; return (i + n / c - CHUNKFACTOR); #if 0 *bytes = n = (n + c - 1) & ~(c - 1); *chunking = c; return (n / c + i); if (n < 256) { *bytes = n = (n + 15) & ~15; *chunking = 16; return(n / (CHUNKINGLO*2) + CHUNKINGLO*1 - 1); } if (n < 8192) { if (n < 512) { *bytes = n = (n + 31) & ~31; *chunking = 32; return(n / (CHUNKINGLO*4) + CHUNKINGLO*2 - 1); } if (n < 1024) { *bytes = n = (n + 63) & ~63; *chunking = 64; return(n / (CHUNKINGLO*8) + CHUNKINGLO*3 - 1); } if (n < 2048) { *bytes = n = (n + 127) & ~127; *chunking = 128; return(n / (CHUNKINGLO*16) + CHUNKINGLO*4 - 1); } if (n < 4096) { *bytes = n = (n + 255) & ~255; *chunking = 256; return(n / (CHUNKINGLO*32) + CHUNKINGLO*5 - 1); } *bytes = n = (n + 511) & ~511; *chunking = 512; return(n / (CHUNKINGLO*64) + CHUNKINGLO*6 - 1); } if (n < 16384) { *bytes = n = (n + 1023) & ~1023; *chunking = 1024; return(n / (CHUNKINGLO*128) + CHUNKINGLO*7 - 1); } if (n < 32768) { /* 16384-32767 */ *bytes = n = (n + 2047) & ~2047; *chunking = 2048; return(n / (CHUNKINGLO*256) + CHUNKINGLO*8 - 1); } if (n < 65536) { *bytes = n = (n + 4095) & ~4095; /* 32768-65535 */ *chunking = 4096; return(n / (CHUNKINGLO*512) + CHUNKINGLO*9 - 1); } x = 131072; c = 8192; i = CHUNKINGLO*10 - 1; while (n >= x) { x <<= 1; c <<= 1; i += CHUNKINGHI; if (x == 0) _mpanic("slaballoc: byte value too high"); } *bytes = n = (n + c - 1) & ~(c - 1); *chunking = c; return (n / c + i); #endif } /* * malloc() - call internal slab allocator */ void * malloc(size_t size) { void *ptr; ptr = memalloc(size, 0); if (ptr == NULL) errno = ENOMEM; else UTRACE(0, size, ptr); return(ptr); } /* * calloc() - call internal slab allocator */ void * calloc(size_t number, size_t size) { void *ptr; ptr = memalloc(number * size, SAFLAG_ZERO); if (ptr == NULL) errno = ENOMEM; else UTRACE(0, number * size, ptr); return(ptr); } /* * realloc() (SLAB ALLOCATOR) * * We do not attempt to optimize this routine beyond reusing the same * pointer if the new size fits within the chunking of the old pointer's * zone. */ void * realloc(void *ptr, size_t size) { void *ret; if (ptr == NULL) ret = memalloc(size, 0); else ret = memrealloc(ptr, size); if (ret == NULL) errno = ENOMEM; else UTRACE(ptr, size, ret); return(ret); } /* * posix_memalign() * * Allocate (size) bytes with a alignment of (alignment), where (alignment) * is a power of 2 >= sizeof(void *). * * The slab allocator will allocate on power-of-2 boundaries up to at least * PAGE_SIZE. Otherwise we use the zoneindex mechanic to find a zone * matching the requirements. */ int posix_memalign(void **memptr, size_t alignment, size_t size) { size_t chunking; int zi; /* * OpenGroup spec issue 6 checks */ if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) { *memptr = NULL; return(EINVAL); } if (alignment < sizeof(void *)) { *memptr = NULL; return(EINVAL); } /* * XXX for now just find the nearest power of 2 >= size and also * >= alignment and allocate that. 
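 *
 * For example, posix_memalign(&p, 64, 200) doubles the alignment to 128
 * and then 256, the first power of 2 >= 200, and allocates 256 bytes;
 * a 256 byte power-of-2 allocation comes back 256 byte aligned, which
 * more than satisfies the requested 64 byte alignment.  A request such
 * as (&p, 4096, 100) simply allocates 4096 bytes.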
*/ while (alignment < size) { alignment <<= 1; if (alignment == 0) _mpanic("posix_memalign: byte value too high"); } *memptr = memalloc(alignment, 0); return(*memptr ? 0 : ENOMEM); } /* * free() (SLAB ALLOCATOR) - do the obvious */ void free(void *ptr) { if (ptr) { UTRACE(ptr, 0, 0); memfree(ptr, 0); } } /* * memalloc() (SLAB ALLOCATOR) * * Allocate memory via the slab allocator. */ static void * memalloc(size_t size, int flags) { slglobaldata_t slgd; struct zoneinfo *zinfo; slab_t slab; size_t chunking; int bmi; int bno; u_long *bmp; int zi; #ifdef INVARIANTS int i; #endif size_t off; char *obj; if (!malloc_started) malloc_init(); /* * If 0 bytes is requested we have to return a unique pointer, allocate * at least one byte. */ if (size == 0) size = 1; /* Capture global flags */ flags |= g_malloc_flags; /* Compute allocation zone; zoneindex will panic on excessive sizes */ zi = zoneindex(&size, &chunking); MASSERT(zi < NZONES); if (size == 0) return(NULL); /* * Locate a slab with available space. If no slabs are available * back-off to the empty list and if we still come up dry allocate * a new slab (which will try the depot first). */ retry: slgd = &slglobal; zinfo = &slgd->zone[zi]; if ((slab = zinfo->avail_base) == NULL) { if ((slab = zinfo->empty_base) == NULL) { /* * Still dry */ slab = slaballoc(zi, chunking, size); if (slab == NULL) return(NULL); slab->next = zinfo->avail_base; zinfo->avail_base = slab; slab->state = AVAIL; if (slgd->biggest_index < zi) slgd->biggest_index = zi; ++slgd->nslabs; } else { /* * Pulled from empty list */ zinfo->empty_base = slab->next; slab->next = zinfo->avail_base; zinfo->avail_base = slab; slab->state = AVAIL; --zinfo->empty_count; } } /* * Allocate a chunk out of the slab. HOT PATH * * Only the thread owning the slab can allocate out of it. * * NOTE: The last bit in the bitmap is always marked allocated so * we cannot overflow here. */ bno = slab->free_bit; bmi = slab->free_index; bmp = &slab->bitmap[bmi]; if (*bmp & (1LU << bno)) { atomic_clear_long(bmp, 1LU << bno); obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size; slab->free_bit = (bno + 1) & (LONG_BITS - 1); atomic_add_int(&slab->navail, -1); if (flags & SAFLAG_ZERO) bzero(obj, size); return (obj); } /* * Allocate a chunk out of a slab. COLD PATH */ if (slab->navail == 0) { zinfo->avail_base = slab->next; slab->state = FULL; TAILQ_INSERT_TAIL(&slgd->full_zones, slab, entry); goto retry; } while (bmi < LONG_BITS) { bmp = &slab->bitmap[bmi]; if (*bmp) { bno = bsflong(*bmp); atomic_clear_long(bmp, 1LU << bno); obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size; slab->free_index = bmi; slab->free_bit = (bno + 1) & (LONG_BITS - 1); atomic_add_int(&slab->navail, -1); if (flags & SAFLAG_ZERO) bzero(obj, size); return (obj); } ++bmi; } bmi = 0; while (bmi < LONG_BITS) { bmp = &slab->bitmap[bmi]; if (*bmp) { bno = bsflong(*bmp); atomic_clear_long(bmp, 1LU << bno); obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size; slab->free_index = bmi; slab->free_bit = (bno + 1) & (LONG_BITS - 1); atomic_add_int(&slab->navail, -1); if (flags & SAFLAG_ZERO) bzero(obj, size); return (obj); } ++bmi; } _mpanic("slaballoc: corrupted zone: navail %d", slab->navail); /* not reached */ return NULL; } /* * Reallocate memory within the chunk */ static void * memrealloc(void *ptr, size_t nsize) { region_t region; slglobaldata_t slgd; slab_t slab; size_t osize; char *obj; int flags; /* * If 0 bytes is requested we have to return a unique pointer, allocate * at least one byte. 
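 *
 * The in-place reuse test below works against the chunk size backing the
 * old pointer.  For example, a pointer backed by a 3072 byte chunk can be
 * resized in place to anything from 1536 to 3072 bytes; shrinking it
 * below 1536 bytes, or growing it past 3072, falls back to a fresh
 * allocation, a copy, and a free of the old pointer.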
*/ if (nsize == 0) nsize = 1; /* Capture global flags */ flags |= g_malloc_flags; /* * Locate the zone by looking up the dynamic slab size mask based * on the memory region the allocation resides in. */ region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK]; if ((slab = region->slab) == NULL) slab = (void *)((uintptr_t)ptr & region->mask); MASSERT(slab->magic == ZALLOC_SLAB_MAGIC); osize = slab->chunk_size; if (nsize <= osize) { if (osize < 32 || nsize >= osize / 2) { obj = ptr; if ((flags & SAFLAG_ZERO) && nsize < osize) bzero(obj + nsize, osize - nsize); return(obj); } } /* * Otherwise resize the object */ obj = memalloc(nsize, 0); if (obj) { if (nsize > osize) nsize = osize; bcopy(ptr, obj, nsize); memfree(ptr, 0); } return (obj); } /* * free (SLAB ALLOCATOR) * * Free a memory block previously allocated by malloc. * * MPSAFE */ static void memfree(void *ptr, int flags) { region_t region; slglobaldata_t slgd; slab_t slab; slab_t stmp; slab_t *slabp; char *obj; int bmi; int bno; u_long *bmp; /* * Locate the zone by looking up the dynamic slab size mask based * on the memory region the allocation resides in. * * WARNING! The slab may be owned by another thread! */ region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK]; if ((slab = region->slab) == NULL) slab = (void *)((uintptr_t)ptr & region->mask); MASSERT(slab != NULL); MASSERT(slab->magic == ZALLOC_SLAB_MAGIC); #ifdef INVARIANTS /* * Put weird data into the memory to detect modifications after * freeing, illegal pointer use after freeing (we should fault on * the odd address), and so forth. */ if (slab->chunk_size < sizeof(weirdary)) bcopy(weirdary, ptr, slab->chunk_size); else bcopy(weirdary, ptr, sizeof(weirdary)); #endif bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size; bmi = bno >> LONG_BITS_SHIFT; bno &= (LONG_BITS - 1); bmp = &slab->bitmap[bmi]; MASSERT(bmi >= 0 && bmi < slab->nmax); MASSERT((*bmp & (1LU << bno)) == 0); atomic_set_long(bmp, 1LU << bno); atomic_add_int(&slab->navail, 1); /* * We can only do the following if we own the slab */ slgd = &slglobal; if (slab->slgd == slgd) { struct zoneinfo *zinfo; if (slab->free_index > bmi) { slab->free_index = bmi; slab->free_bit = bno; } else if (slab->free_index == bmi && slab->free_bit > bno) { slab->free_bit = bno; } zinfo = &slgd->zone[slab->zone_index]; /* * Freeing an object from a full slab will move it to the * available list. If the available list already has a * slab we terminate the full slab instead, moving it to * the depot. */ if (slab->state == FULL) { TAILQ_REMOVE(&slgd->full_zones, slab, entry); if (zinfo->avail_base == NULL) { slab->state = AVAIL; stmp = zinfo->avail_base; slab->next = stmp; zinfo->avail_base = slab; } else { slabterm(slgd, slab); goto done; } } /* * If the slab becomes completely empty dispose of it in * some manner. By default each thread caches up to 4 * empty slabs. Only small slabs are cached. */ if (slab->navail == slab->nmax && slab->state == AVAIL) { /* * Remove slab from available queue */ slabp = &zinfo->avail_base; while ((stmp = *slabp) != slab) slabp = &stmp->next; *slabp = slab->next; if (opt_free || opt_cache == 0) { /* * If local caching is disabled cache the * slab in the depot (or free it). */ slabterm(slgd, slab); } else if (slab->slab_size > BIGSLABSIZE) { /* * We do not try to retain large slabs * in per-thread caches. 
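 *
 * "Large" here means a slab bigger than BIGSLABSIZE (128MB); such a
 * slab is handed to slabterm() below, which either parks it in the
 * depot or frees it outright, rather than sitting on this thread's
 * empty list.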
*/ slabterm(slgd, slab); } else if (zinfo->empty_count > opt_cache) { /* * We have too many slabs cached, but * instead of freeing this one free * an empty slab that's been idle longer. * * (empty_count does not change) */ stmp = zinfo->empty_base; slab->state = EMPTY; slab->next = stmp->next; zinfo->empty_base = slab; slabterm(slgd, stmp); } else { /* * Cache the empty slab in our thread local * empty list. */ ++zinfo->empty_count; slab->state = EMPTY; slab->next = zinfo->empty_base; zinfo->empty_base = slab; } } } done: ; } /* * Allocate a new slab holding objects of size chunk_size. */ static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size) { slglobaldata_t slgd; slglobaldata_t sldepot; struct zoneinfo *zinfo; region_t region; void *save; slab_t slab; slab_t stmp; size_t slab_desire; size_t slab_size; size_t region_mask; uintptr_t chunk_offset; ssize_t maxchunks; ssize_t tmpchunks; int ispower2; int power; int ri; int rx; int nswath; int j; /* * First look in the depot. Any given zone in the depot may be * locked by being set to -1. We have to do this instead of simply * removing the entire chain because removing the entire chain can * cause racing threads to allocate local slabs for large objects, * resulting in a large VSZ. */ slgd = &slglobal; sldepot = slgd->sldepot; zinfo = &sldepot->zone[zi]; while ((slab = zinfo->avail_base) != NULL) { if ((void *)slab == LOCKEDPTR) { cpu_pause(); continue; } if (atomic_cmpset_ptr(&zinfo->avail_base, slab, LOCKEDPTR)) { MASSERT(slab->slgd == NULL); slab->slgd = slgd; zinfo->avail_base = slab->next; return(slab); } } /* * Nothing in the depot, allocate a new slab by locating or assigning * a region and then using the system virtual memory allocator. */ slab = NULL; /* * Calculate the start of the data chunks relative to the start * of the slab. */ if ((chunk_size ^ (chunk_size - 1)) == (chunk_size << 1) - 1) { ispower2 = 1; chunk_offset = (sizeof(*slab) + (chunk_size - 1)) & ~(chunk_size - 1); } else { ispower2 = 0; chunk_offset = sizeof(*slab) + chunking - 1; chunk_offset -= chunk_offset % chunking; } /* * Calculate a reasonable number of chunks for the slab. * * Once initialized the MaxChunks[] array can only ever be * reinitialized to the same value. */ maxchunks = MaxChunks[zi]; if (maxchunks == 0) { /* * First calculate how many chunks would fit in 1/1024 * available memory. This is around 2MB on a 32 bit * system and 128G on a 64-bit (48-bits available) system. */ maxchunks = (ssize_t)(NREGIONS_SIZE - chunk_offset) / (ssize_t)chunk_size; if (maxchunks <= 0) maxchunks = 1; /* * A slab cannot handle more than MAXCHUNKS chunks, but * limit us to approximately MAXCHUNKS / 2 here because * we may have to expand maxchunks when we calculate the * actual power-of-2 slab. */ if (maxchunks > MAXCHUNKS / 2) maxchunks = MAXCHUNKS / 2; /* * Try to limit the slabs to BIGSLABSIZE (~128MB). Larger * slabs will be created if the allocation does not fit. */ if (chunk_offset + chunk_size * maxchunks > BIGSLABSIZE) { tmpchunks = (ssize_t)(BIGSLABSIZE - chunk_offset) / (ssize_t)chunk_size; if (tmpchunks <= 0) tmpchunks = 1; if (maxchunks > tmpchunks) maxchunks = tmpchunks; } /* * If the slab calculates to greater than 2MB see if we * can cut it down to ~2MB. This controls VSZ but has * no effect on run-time size or performance. * * This is very important in case you core dump and also * important to reduce unnecessary region allocations. 
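 *
 * As a rough example, take the 8192 byte zone and assume the in-band
 * slab header rounds the first chunk up to offset 8192: the initial
 * MAXCHUNKS / 2 estimate of 2048 chunks would ask for a ~16MB slab,
 * this ~2MB pass trims the estimate to 255 chunks, and the smaller
 * slab pass after it trims it again to the 32 chunk floor before the
 * actual power-of-2 slab size is computed.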
*/ if (chunk_offset + chunk_size * maxchunks > NOMSLABSIZE) { tmpchunks = (ssize_t)(NOMSLABSIZE - chunk_offset) / (ssize_t)chunk_size; if (tmpchunks < 1) tmpchunks = 1; if (maxchunks > tmpchunks) maxchunks = tmpchunks; } /* * If the slab calculates to greater than 128K see if we * can cut it down to ~128K while still maintaining a * reasonably large number of chunks in each slab. This * controls VSZ but has no effect on run-time size or * performance. * * This is very important in case you core dump and also * important to reduce unnecessary region allocations. */ if (chunk_offset + chunk_size * maxchunks > LITSLABSIZE) { tmpchunks = (ssize_t)(LITSLABSIZE - chunk_offset) / (ssize_t)chunk_size; if (tmpchunks < 32) tmpchunks = 32; if (maxchunks > tmpchunks) maxchunks = tmpchunks; } MaxChunks[zi] = maxchunks; } MASSERT(maxchunks > 0 && maxchunks <= MAXCHUNKS); /* * Calculate the actual slab size. maxchunks will be recalculated * a little later. */ slab_desire = chunk_offset + chunk_size * maxchunks; slab_size = 8 * MAXCHUNKS; power = 3 + MAXCHUNKS_BITS; while (slab_size < slab_desire) { slab_size <<= 1; ++power; } /* * Do a quick recalculation based on the actual slab size but not * yet dealing with whether the slab header is in-band or out-of-band. * The purpose here is to see if we can reasonably reduce slab_size * to a power of 4 to allow more chunk sizes to use the same slab * size. */ if ((power & 1) && slab_size > 32768) { maxchunks = (slab_size - chunk_offset) / chunk_size; if (maxchunks >= MAXCHUNKS / 8) { slab_size >>= 1; --power; } } if ((power & 2) && slab_size > 32768 * 4) { maxchunks = (slab_size - chunk_offset) / chunk_size; if (maxchunks >= MAXCHUNKS / 4) { slab_size >>= 2; power -= 2; } } /* * This case occurs when the slab_size is larger than 1/1024 available * UVM. */ nswath = slab_size / NREGIONS_SIZE; if (nswath > NREGIONS) return (NULL); /* * Try to allocate from our current best region for this zi */ region_mask = ~(slab_size - 1); ri = slgd->zone[zi].best_region; if (Regions[ri].mask == region_mask) { if ((slab = _vmem_alloc(ri, slab_size)) != NULL) goto found; } /* * Try to find an existing region to allocate from. The normal * case will be for allocations that are less than 1/1024 available * UVM, which fit into a single Regions[] entry. */ while (slab_size <= NREGIONS_SIZE) { rx = -1; for (ri = 0; ri < NREGIONS; ++ri) { if (rx < 0 && Regions[ri].mask == 0) rx = ri; if (Regions[ri].mask == region_mask) { slab = _vmem_alloc(ri, slab_size); if (slab) { slgd->zone[zi].best_region = ri; goto found; } } } if (rx < 0) return(NULL); /* * This can fail, retry either way */ atomic_cmpset_ptr((void **)&Regions[rx].mask, NULL, (void *)region_mask); } for (;;) { rx = -1; for (ri = 0; ri < NREGIONS; ri += nswath) { if (Regions[ri].mask == region_mask) { slab = _vmem_alloc(ri, slab_size); if (slab) { slgd->zone[zi].best_region = ri; goto found; } } if (rx < 0) { for (j = nswath - 1; j >= 0; --j) { if (Regions[ri+j].mask != 0) break; } if (j < 0) rx = ri; } } /* * We found a candidate, try to allocate it backwards so * another thread racing a slaballoc() does not see the * mask in the base index position until we are done. * * We can safely zero-out any partial allocations because * the mask is only accessed from the base index. Any other * threads racing us will fail prior to us clearing the mask. 
*/ if (rx < 0) return(NULL); for (j = nswath - 1; j >= 0; --j) { if (!atomic_cmpset_ptr((void **)&Regions[rx+j].mask, NULL, (void *)region_mask)) { while (++j < nswath) Regions[rx+j].mask = 0; break; } } /* retry */ } /* * Fill in the new slab in region ri. If the slab_size completely * fills one or more region slots we move the slab structure out of * band which should optimize the chunking (particularly for a power * of 2). */ found: region = &Regions[ri]; MASSERT(region->slab == NULL); if (slab_size >= NREGIONS_SIZE) { save = slab; slab = memalloc(sizeof(*slab), 0); bzero(slab, sizeof(*slab)); slab->chunks = save; for (j = 0; j < nswath; ++j) region[j].slab = slab; chunk_offset = 0; } else { bzero(slab, sizeof(*slab)); slab->chunks = (char *)slab + chunk_offset; } /* * Calculate the start of the chunks memory and recalculate the * actual number of chunks the slab can hold. */ maxchunks = (slab_size - chunk_offset) / chunk_size; if (maxchunks > MAXCHUNKS) maxchunks = MAXCHUNKS; /* * And fill in the rest */ slab->magic = ZALLOC_SLAB_MAGIC; slab->navail = maxchunks; slab->nmax = maxchunks; slab->slab_size = slab_size; slab->chunk_size = chunk_size; slab->zone_index = zi; slab->slgd = slgd; slab->state = UNKNOWN; slab->region = region; for (ri = 0; ri < maxchunks; ri += LONG_BITS) { if (ri + LONG_BITS <= maxchunks) slab->bitmap[ri >> LONG_BITS_SHIFT] = ULONG_MAX; else slab->bitmap[ri >> LONG_BITS_SHIFT] = (1LU << (maxchunks - ri)) - 1; } return (slab); } /* * Free a slab. */ static void slabfree(slab_t slab) { int nswath; int j; if (slab->region->slab == slab) { /* * Out-of-band slab. */ nswath = slab->slab_size / NREGIONS_SIZE; for (j = 0; j < nswath; ++j) slab->region[j].slab = NULL; slab->magic = 0; _vmem_free(slab->chunks, slab->slab_size); memfree(slab, 0); } else { /* * In-band slab. */ slab->magic = 0; _vmem_free(slab, slab->slab_size); } } /* * Terminate a slab's use in the current thread. The slab may still have * outstanding allocations and thus not be deallocatable. */ static void slabterm(slglobaldata_t slgd, slab_t slab) { slglobaldata_t sldepot = slgd->sldepot; struct zoneinfo *zinfo; slab_t dnext; int zi = slab->zone_index; slab->slgd = NULL; --slgd->nslabs; zinfo = &sldepot->zone[zi]; /* * If the slab can be freed and the depot is either locked or not * empty, then free the slab. */ if (slab->navail == slab->nmax && zinfo->avail_base) { slab->state = UNKNOWN; slabfree(slab); return; } slab->state = AVAIL; /* * Link the slab into the depot */ for (;;) { dnext = zinfo->avail_base; cpu_ccfence(); if ((void *)dnext == LOCKEDPTR) { cpu_pause(); continue; } slab->next = dnext; if (atomic_cmpset_ptr(&zinfo->avail_base, dnext, slab)) break; } } /* * _vmem_alloc() * * Directly map memory in PAGE_SIZE'd chunks with the specified * alignment. * * Alignment must be a multiple of PAGE_SIZE. * * Size must be >= alignment. */ static void * _vmem_alloc(int ri, size_t slab_size) { char *baddr = (void *)((uintptr_t)ri << NREGIONS_SHIFT); char *eaddr; char *addr; char *save; uintptr_t excess; if (slab_size < NREGIONS_SIZE) eaddr = baddr + NREGIONS_SIZE; else eaddr = baddr + slab_size; /* * This usually just works but might not. */ addr = mmap(baddr, slab_size, PROT_READ|PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_SIZEALIGN, -1, 0); if (addr == MAP_FAILED) { return (NULL); } if (addr < baddr || addr + slab_size > eaddr) { munmap(addr, slab_size); return (NULL); } /* * Check alignment. The misaligned offset is also the excess * amount. 
If misaligned unmap the excess so we have a chance of * mapping at the next alignment point and recursively try again. * * BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB block alignment * aaaaaaaaa aaaaaaaaaaa aa mis-aligned allocation * xxxxxxxxx final excess calculation * ^ returned address */ excess = (uintptr_t)addr & (slab_size - 1); while (excess) { excess = slab_size - excess; save = addr; munmap(save + excess, slab_size - excess); addr = _vmem_alloc(ri, slab_size); munmap(save, excess); if (addr == NULL) return (NULL); if (addr < baddr || addr + slab_size > eaddr) { munmap(addr, slab_size); return (NULL); } excess = (uintptr_t)addr & (slab_size - 1); } return (addr); } /* * _vmem_free() * * Free a chunk of memory allocated with _vmem_alloc() */ static void _vmem_free(void *ptr, size_t size) { munmap(ptr, size); } /* * Panic on fatal conditions */ static void _mpanic(const char *ctl, ...) { va_list va; if (malloc_panic == 0) { malloc_panic = 1; va_start(va, ctl); vfprintf(stderr, ctl, va); fprintf(stderr, "\n"); fflush(stderr); va_end(va); } abort(); }
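
#if 0
/*
 * Illustrative usage sketch, hypothetical and kept under #if 0 so it is
 * never compiled into the allocator.  It exercises the documented API
 * behaviors: malloc(0) returning a unique non-NULL pointer,
 * posix_memalign() rounding the request up to a power of 2, and realloc()
 * reusing the pointer when the new size still fits the old chunk.  The
 * names below are local to the example and assume the program is linked
 * against this allocator.
 */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int
main(void)
{
	void *a;
	void *b;
	void *b2;
	void *p = NULL;

	a = malloc(0);				/* special non-NULL value */
	assert(a != NULL);

	assert(posix_memalign(&p, 64, 200) == 0);
	assert(((uintptr_t)p & 63) == 0);	/* 256 byte power-of-2 chunk */

	b = malloc(3000);			/* rounds up to a 3072 byte chunk */
	b2 = realloc(b, 2000);			/* 2000 >= 3072 / 2, reused in place */
	assert(b2 == b);

	free(a);
	free(b2);
	free(p);
	return 0;
}
#endif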