/*
 * KERN_SLABALLOC.C	- Kernel SLAB memory allocator
 *
 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/kern_slaballoc.c,v 1.13 2003/10/20 16:09:00 dillon Exp $
 *
 * This module implements a slab allocator drop-in replacement for the
 * kernel malloc().
 *
 * A slab allocator reserves a ZONE for each chunk size, then lays the
 * chunks out in an array within the zone.  Allocation and deallocation
 * is nearly instantaneous, and fragmentation/overhead losses are limited
 * to a fixed worst-case amount.
 *
 * The downside of this slab implementation is in the chunk size
 * multiplied by the number of zones.  ~80 zones * 128K = 10MB of VM per cpu.
 * In a kernel implementation all this memory will be physical so
 * the zone size is adjusted downward on machines with less physical
 * memory.  The upside is that overhead is bounded... this is the *worst*
 * case overhead.
 *
 * Slab management is done on a per-cpu basis and no locking or mutexes
 * are required, only a critical section.  When one cpu frees memory
 * belonging to another cpu's slab manager an asynchronous IPI message
 * will be queued to execute the operation.  In addition, both the
 * high level slab allocator and the low level zone allocator optimize
 * M_ZERO requests, and the slab allocator does not have to pre-initialize
 * the linked list of chunks.
 *
 * XXX Balancing is needed between cpus.  Balance will be handled through
 * asynchronous IPIs primarily by reassigning the z_Cpu ownership of chunks.
 *
 * XXX If we have to allocate a new zone and M_USE_RESERVE is set, use of
 * the new zone should be restricted to M_USE_RESERVE requests only.
 *
 *	Alloc Size	Chunking	Number of zones
 *	0-127		8		16
 *	128-255		16		8
 *	256-511		32		8
 *	512-1023	64		8
 *	1024-2047	128		8
 *	2048-4095	256		8
 *	4096-8191	512		8
 *	8192-16383	1024		8
 *	16384-32767	2048		8
 *	(if PAGE_SIZE is 4K the maximum zone allocation is 16383)
 *
 *	Allocations >= ZoneLimit go directly to kmem.
 *
 *			API REQUIREMENTS AND SIDE EFFECTS
 *
 *    To operate as a drop-in replacement for the FreeBSD-4.x malloc() we
 *    have remained compatible with the following API requirements:
 *
 *    + small power-of-2 sized allocations are power-of-2 aligned (kern_tty)
 *    + all power-of-2 sized allocations are power-of-2 aligned (twe)
 *    + malloc(0) is allowed and returns non-NULL (ahc driver)
 *    + ability to allocate arbitrarily large chunks of memory
 */

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/slaballoc.h>
#include <sys/mbuf.h>
#include <sys/vmmeter.h>
#include <sys/lock.h>
#include <sys/thread.h>
#include <sys/globaldata.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>

#include <machine/cpu.h>

#include <sys/thread2.h>

#define arysize(ary)	(sizeof(ary)/sizeof((ary)[0]))

/*
 * Fixed globals (not per-cpu)
 */
static int ZoneSize;
static int ZoneLimit;
static int ZonePageCount;
static int ZonePageLimit;
static int ZoneMask;
static struct malloc_type *kmemstatistics;
static struct kmemusage *kmemusage;
static int32_t weirdary[16];

static void *kmem_slab_alloc(vm_size_t bytes, vm_offset_t align, int flags);
static void kmem_slab_free(void *ptr, vm_size_t bytes);

/*
 * Misc constants.  Note that allocations that are exact multiples of
 * PAGE_SIZE, or exceed the zone limit, fall through to the kmem module.
 * IN_SAME_PAGE_MASK is used to sanity-check the per-page free lists.
 */
#define MIN_CHUNK_SIZE		8		/* in bytes */
#define MIN_CHUNK_MASK		(MIN_CHUNK_SIZE - 1)
#define ZONE_RELS_THRESH	2		/* threshold number of zones */
#define IN_SAME_PAGE_MASK	(~(intptr_t)PAGE_MASK | MIN_CHUNK_MASK)
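
/*
 * Example of the sanity check (a sketch assuming 4K pages; the addresses
 * are hypothetical): chunks at 0xc0001010 and 0xc0001ff8 satisfy
 *
 *	(0xc0001010 & IN_SAME_PAGE_MASK) == (0xc0001ff8 & IN_SAME_PAGE_MASK)
 *
 * while a pointer into the next page (0xc0002010) does not.  A c_Next
 * link that escapes the chunk's page, or that loses its MIN_CHUNK_SIZE
 * alignment, therefore trips the DIAGNOSTIC KKASSERT in malloc() below.
 */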

/*
 * The WEIRD_ADDR is used as known text to copy into free objects to
 * try to create deterministic failure cases if the data is accessed after
 * free.
 */
#define WEIRD_ADDR	0xdeadc0de
#define MAX_COPY	sizeof(weirdary)
#define ZERO_LENGTH_PTR	((void *)-8)
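
/*
 * Caller-visible behavior of the ZERO_LENGTH_PTR convention (a sketch;
 * M_TEMP is one of the buckets defined below):
 *
 *	p = malloc(0, M_TEMP, M_NOWAIT);	(returns ZERO_LENGTH_PTR)
 *	p = realloc(p, 32, M_TEMP, M_NOWAIT);	(treated as a fresh malloc)
 *	free(p, M_TEMP);			(ZERO_LENGTH_PTR frees are no-ops)
 *
 * The value -8 keeps the pointer MIN_CHUNK_SIZE aligned while pointing at
 * the very top of the address space, so accidental dereferences should
 * fault immediately, yet the pointer remains non-NULL for drivers that
 * test against NULL.
 */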

/*
 * Misc global malloc buckets
 */

MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches");
MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers");

MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");
MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery");
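
/*
 * A subsystem wanting its own bucket typically pairs a MALLOC_DECLARE()
 * in a header with a MALLOC_DEFINE() in one source file, e.g. (sketch,
 * M_FOOBUF is hypothetical):
 *
 *	MALLOC_DEFINE(M_FOOBUF, "foobuf", "foo driver buffers");
 *
 * malloc_init() below is run on the type before first use; malloc()
 * also initializes a type lazily if nothing else has done so.
 */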

/*
 * Initialize the slab memory allocator.  We have to choose a zone size based
 * on available physical memory.  We choose a zone size which is approximately
 * 1/1024th of our memory, so if we have 128MB of ram we have a zone size of
 * 128K.  The zone size is limited to the bounds set in slaballoc.h
 * (typically 32K min, 128K max).
 */
static void kmeminit(void *dummy);

SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL)

static void
kmeminit(void *dummy)
{
	vm_poff_t limsize;
	int usesize;
	int i;
	vm_pindex_t npg;

	limsize = (vm_poff_t)vmstats.v_page_count * PAGE_SIZE;
	if (limsize > VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS)
		limsize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

	usesize = (int)(limsize / 1024);	/* convert to KB */

	ZoneSize = ZALLOC_MIN_ZONE_SIZE;
	while (ZoneSize < ZALLOC_MAX_ZONE_SIZE && (ZoneSize << 1) < usesize)
		ZoneSize <<= 1;
	ZoneLimit = ZoneSize / 4;
	if (ZoneLimit > ZALLOC_ZONE_LIMIT)
		ZoneLimit = ZALLOC_ZONE_LIMIT;
	ZoneMask = ZoneSize - 1;
	ZonePageLimit = PAGE_SIZE * 4;
	ZonePageCount = ZoneSize / PAGE_SIZE;

	npg = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / PAGE_SIZE;
	kmemusage = kmem_slab_alloc(npg * sizeof(struct kmemusage), PAGE_SIZE, M_ZERO);

	for (i = 0; i < arysize(weirdary); ++i)
		weirdary[i] = WEIRD_ADDR;

	if (bootverbose)
		printf("Slab ZoneSize set to %dKB\n", ZoneSize / 1024);
}
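
/*
 * Worked example (a sketch assuming the typical 32K/128K bounds noted
 * above and 4K pages): with 512MB of ram, limsize is 512MB and usesize
 * is 524288 (KB), so ZoneSize doubles from 32K until it hits the 128K
 * cap.  That yields ZonePageCount = 32, with ZoneLimit = ZoneSize / 4
 * further capped by ZALLOC_ZONE_LIMIT.
 */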

/*
 * Initialize a malloc type tracking structure.
 */
void
malloc_init(void *data)
{
	struct malloc_type *type = data;
	vm_poff_t limsize;

	if (type->ks_magic != M_MAGIC)
		panic("malloc type lacks magic");

	if (type->ks_limit != 0)
		return;

	if (vmstats.v_page_count == 0)
		panic("malloc_init not allowed before vm init");

	limsize = (vm_poff_t)vmstats.v_page_count * PAGE_SIZE;
	if (limsize > VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS)
		limsize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
	type->ks_limit = limsize / 10;

	type->ks_next = kmemstatistics;
	kmemstatistics = type;
}

void
malloc_uninit(void *data)
{
	struct malloc_type *type = data;
	struct malloc_type *t;
#ifdef INVARIANTS
	int i;
	long ttl;
#endif

	if (type->ks_magic != M_MAGIC)
		panic("malloc type lacks magic");

	if (vmstats.v_page_count == 0)
		panic("malloc_uninit not allowed before vm init");

	if (type->ks_limit == 0)
		panic("malloc_uninit on uninitialized type");

#ifdef INVARIANTS
	/*
	 * memuse is only correct in aggregation.  Due to memory being
	 * allocated on one cpu and freed on another, individual array
	 * entries may be negative or positive (canceling each other out).
	 */
	for (i = ttl = 0; i < ncpus; ++i)
		ttl += type->ks_memuse[i];
	if (ttl) {
		printf("malloc_uninit: %ld bytes of '%s' still allocated\n",
		    ttl, type->ks_shortdesc);
	}
#endif
	if (type == kmemstatistics) {
		kmemstatistics = type->ks_next;
	} else {
		for (t = kmemstatistics; t->ks_next != NULL; t = t->ks_next) {
			if (t->ks_next == type) {
				t->ks_next = type->ks_next;
				break;
			}
		}
	}
	type->ks_next = NULL;
	type->ks_limit = 0;
}

/*
 * Calculate the zone index for the allocation request size and set the
 * allocation request size to that particular zone's chunk size.
 */
static __inline int
zoneindex(unsigned long *bytes)
{
	unsigned int n = (unsigned int)*bytes;	/* unsigned for shift opt */
	if (n < 128) {
		*bytes = n = (n + 7) & ~7;
		return(n / 8 - 1);		/* 8 byte chunks, 16 zones */
	}
	if (n < 256) {
		*bytes = n = (n + 15) & ~15;
		return(n / 16 + 7);
	}
	if (n < 8192) {
		if (n < 512) {
			*bytes = n = (n + 31) & ~31;
			return(n / 32 + 15);
		}
		if (n < 1024) {
			*bytes = n = (n + 63) & ~63;
			return(n / 64 + 23);
		}
		if (n < 2048) {
			*bytes = n = (n + 127) & ~127;
			return(n / 128 + 31);
		}
		if (n < 4096) {
			*bytes = n = (n + 255) & ~255;
			return(n / 256 + 39);
		}
		*bytes = n = (n + 511) & ~511;
		return(n / 512 + 47);
	}
#if ZALLOC_ZONE_LIMIT > 8192
	if (n < 16384) {
		*bytes = n = (n + 1023) & ~1023;
		return(n / 1024 + 55);
	}
#endif
#if ZALLOC_ZONE_LIMIT > 16384
	if (n < 32768) {
		*bytes = n = (n + 2047) & ~2047;
		return(n / 2048 + 63);
	}
#endif
	panic("Unexpected byte count %d", n);
	return(0);
}
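
/*
 * Worked examples of the mapping above (sizes in bytes, matching the
 * Alloc Size / Chunking table in the file header):
 *
 *	request 100  -> rounds to 104  (8-byte chunking),   zone index 12
 *	request 200  -> rounds to 208  (16-byte chunking),  zone index 20
 *	request 3000 -> rounds to 3072 (256-byte chunking), zone index 51
 */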

/*
 * malloc() (SLAB ALLOCATOR)
 *
 * Allocate memory via the slab allocator.  If the request is too large,
 * or is an exact multiple of PAGE_SIZE, we fall back to the KMEM
 * subsystem.  A SLAB tracking descriptor must be specified, use
 * &SlabMisc if you don't care.
 *
 *	M_NOWAIT	- return NULL instead of blocking.
 *	M_ZERO		- zero the returned memory.
 *	M_USE_RESERVE	- allocate out of the system reserve if necessary
 */
void *
malloc(unsigned long size, struct malloc_type *type, int flags)
{
	SLZone *z;
	SLChunk *chunk;
	SLGlobalData *slgd;
	struct globaldata *gd;
	int zi;

	gd = mycpu;
	slgd = &gd->gd_slab;

	/*
	 * XXX silly to have this in the critical path.
	 */
	if (type->ks_limit == 0) {
		crit_enter();
		if (type->ks_limit == 0)
			malloc_init(type);
		crit_exit();
	}
	++type->ks_calls;

	/*
	 * Handle the case where the limit is reached.  Panic if we cannot
	 * return NULL.  XXX the original malloc code looped, but this tended
	 * to simply deadlock the computer.
	 */
	while (type->ks_loosememuse >= type->ks_limit) {
		int i;
		long ttl;

		for (i = ttl = 0; i < ncpus; ++i)
			ttl += type->ks_memuse[i];
		type->ks_loosememuse = ttl;
		if (ttl >= type->ks_limit) {
			if (flags & (M_NOWAIT|M_NULLOK))
				return(NULL);
			panic("%s: malloc limit exceeded", type->ks_shortdesc);
		}
	}

	/*
	 * Handle the degenerate size == 0 case.  Yes, this does happen.
	 * Return a special pointer.  This is to maintain compatibility with
	 * the original malloc implementation.  Certain devices, such as the
	 * Adaptec driver, not only allocate 0 bytes, they check for NULL and
	 * also realloc() later on.  Joy.
	 */
	if (size == 0)
		return(ZERO_LENGTH_PTR);

	/*
	 * Handle hysteresis from prior frees here in malloc().  We cannot
	 * safely manipulate the kernel_map in free() due to free() possibly
	 * being called via an IPI message or from sensitive interrupt code.
	 */
	while (slgd->NFreeZones > ZONE_RELS_THRESH && (flags & M_NOWAIT) == 0) {
		crit_enter();
		if (slgd->NFreeZones > ZONE_RELS_THRESH) {	/* crit sect race */
			z = slgd->FreeZones;
			slgd->FreeZones = z->z_Next;
			--slgd->NFreeZones;
			kmem_slab_free(z, ZoneSize);	/* may block */
		}
		crit_exit();
	}
	/*
	 * XXX handle oversized frees that were queued from free().
	 */
	while (slgd->FreeOvZones && (flags & M_NOWAIT) == 0) {
		crit_enter();
		if ((z = slgd->FreeOvZones) != NULL) {
			KKASSERT(z->z_Magic == ZALLOC_OVSZ_MAGIC);
			slgd->FreeOvZones = z->z_Next;
			kmem_slab_free(z, z->z_ChunkSize);	/* may block */
		}
		crit_exit();
	}

	/*
	 * Handle large allocations directly.  There should not be very many
	 * of these so performance is not a big issue.
	 *
	 * Guarantee page alignment for allocations in multiples of PAGE_SIZE.
	 */
	if (size >= ZoneLimit || (size & PAGE_MASK) == 0) {
		struct kmemusage *kup;

		size = round_page(size);
		chunk = kmem_slab_alloc(size, PAGE_SIZE, flags);
		if (chunk == NULL)
			return(NULL);
		flags &= ~M_ZERO;	/* result already zero'd if M_ZERO was set */
		flags |= M_PASSIVE_ZERO;
		kup = btokup(chunk);
		kup->ku_pagecnt = size / PAGE_SIZE;
		kup->ku_cpu = gd->gd_cpuid;
		crit_enter();
		goto done;
	}

	/*
	 * Attempt to allocate out of an existing zone.  First try the free
	 * list, then allocate out of unallocated space.  If we find a good
	 * zone move it to the head of the list so later allocations find it
	 * quickly (we might have thousands of zones in the list).
	 *
	 * Note: zoneindex() will panic if size is too large.
	 */
	zi = zoneindex(&size);
	KKASSERT(zi < NZONES);
	crit_enter();
	if ((z = slgd->ZoneAry[zi]) != NULL) {
		KKASSERT(z->z_NFree > 0);

		/*
		 * Remove us from the ZoneAry[] when we become empty
		 */
		if (--z->z_NFree == 0) {
			slgd->ZoneAry[zi] = z->z_Next;
			z->z_Next = NULL;
		}

		/*
		 * Locate a chunk in a free page.  This attempts to localize
		 * reallocations into earlier pages without us having to sort
		 * the chunk list.  A chunk may still overlap a page boundary.
		 */
		while (z->z_FirstFreePg < ZonePageCount) {
			if ((chunk = z->z_PageAry[z->z_FirstFreePg]) != NULL) {
#ifdef DIAGNOSTIC
				/*
				 * Diagnostic: c_Next is not total garbage.
				 */
				KKASSERT(chunk->c_Next == NULL ||
				    ((intptr_t)chunk->c_Next & IN_SAME_PAGE_MASK) ==
				    ((intptr_t)chunk & IN_SAME_PAGE_MASK));
#endif
#ifdef INVARIANTS
				if ((uintptr_t)chunk < VM_MIN_KERNEL_ADDRESS)
					panic("chunk %p FFPG %d/%d", chunk, z->z_FirstFreePg, ZonePageCount);
				if (chunk->c_Next && (uintptr_t)chunk->c_Next < VM_MIN_KERNEL_ADDRESS)
					panic("chunkNEXT %p %p FFPG %d/%d", chunk, chunk->c_Next, z->z_FirstFreePg, ZonePageCount);
#endif
				z->z_PageAry[z->z_FirstFreePg] = chunk->c_Next;
				goto done;
			}
			++z->z_FirstFreePg;
		}

		/*
		 * No chunks are available but NFree said we had some memory,
		 * so it must be available in the never-before-used-memory
		 * area governed by UIndex.  The consequences are very serious
		 * if our zone got corrupted so we use an explicit panic
		 * rather than a KASSERT.
		 */
		if (z->z_UIndex + 1 != z->z_NMax)
			z->z_UIndex = z->z_UIndex + 1;
		else
			z->z_UIndex = 0;
		if (z->z_UIndex == z->z_UEndIndex)
			panic("slaballoc: corrupted zone");
		chunk = (SLChunk *)(z->z_BasePtr + z->z_UIndex * size);
		if ((z->z_Flags & SLZF_UNOTZEROD) == 0) {
			flags &= ~M_ZERO;
			flags |= M_PASSIVE_ZERO;
		}
		goto done;
	}

	/*
	 * If all zones are exhausted we need to allocate a new zone for this
	 * index.  Use M_ZERO to take advantage of pre-zeroed pages.  Also
	 * see the UIndex use above in regards to M_ZERO.  Note that when we
	 * are reusing a zone from the FreeZones list the UIndex data will
	 * not be zero'd, and we do not pre-zero it because we do not want
	 * to mess up the L1 cache.
	 *
	 * At least one subsystem, the tty code (see CROUND), expects
	 * power-of-2 allocations to be power-of-2 aligned.  We maintain
	 * compatibility by adjusting the base offset below.
	 */
	{
		int off;

		if ((z = slgd->FreeZones) != NULL) {
			slgd->FreeZones = z->z_Next;
			--slgd->NFreeZones;
			bzero(z, sizeof(SLZone));
			z->z_Flags |= SLZF_UNOTZEROD;
		} else {
			z = kmem_slab_alloc(ZoneSize, ZoneSize, flags|M_ZERO);
			if (z == NULL)
				goto fail;
		}

		/*
		 * Guarantee power-of-2 alignment for power-of-2-sized
		 * chunks.  Otherwise just 8-byte align the data.
		 */
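		/*
		 * The test below is a standard power-of-2 check: for
		 * size 64, (64|63)+1 == 128 == 64<<1, whereas for size 96,
		 * (96|95)+1 == 128 != 192.
		 */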
		if ((size | (size - 1)) + 1 == (size << 1))
			off = (sizeof(SLZone) + size - 1) & ~(size - 1);
		else
			off = (sizeof(SLZone) + MIN_CHUNK_MASK) & ~MIN_CHUNK_MASK;
		z->z_Magic = ZALLOC_SLAB_MAGIC;
		z->z_ZoneIndex = zi;
		z->z_NMax = (ZoneSize - off) / size;
		z->z_NFree = z->z_NMax - 1;
		z->z_BasePtr = (char *)z + off;
		z->z_UIndex = z->z_UEndIndex = slgd->JunkIndex % z->z_NMax;
		z->z_ChunkSize = size;
		z->z_FirstFreePg = ZonePageCount;
		z->z_Cpu = gd->gd_cpuid;
		chunk = (SLChunk *)(z->z_BasePtr + z->z_UIndex * size);
		z->z_Next = slgd->ZoneAry[zi];
		slgd->ZoneAry[zi] = z;
		if ((z->z_Flags & SLZF_UNOTZEROD) == 0) {
			flags &= ~M_ZERO;	/* already zero'd */
			flags |= M_PASSIVE_ZERO;
		}

		/*
		 * Slide the base index for initial allocations out of the
		 * next zone we create so we do not over-weight the lower
		 * part of the cpu memory caches.
		 */
		slgd->JunkIndex = (slgd->JunkIndex + ZALLOC_SLAB_SLIDE)
		    & (ZALLOC_MAX_ZONE_SIZE - 1);
	}
done:
	++type->ks_inuse[gd->gd_cpuid];
	type->ks_memuse[gd->gd_cpuid] += size;
	type->ks_loosememuse += size;
	crit_exit();
	if (flags & M_ZERO)
		bzero(chunk, size);
#ifdef INVARIANTS
	else if ((flags & (M_ZERO|M_PASSIVE_ZERO)) == 0)
		chunk->c_Next = (void *)-1;	/* avoid accidental double-free check */
#endif
	return(chunk);
fail:
	crit_exit();
	return(NULL);
}
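
/*
 * Usage sketch (hypothetical caller; M_TEMP is one of the buckets defined
 * above, struct foo is illustrative):
 *
 *	struct foo *fp;
 *
 *	fp = malloc(sizeof(*fp), M_TEMP, M_NOWAIT | M_ZERO);
 *	if (fp == NULL)			(only possible with M_NOWAIT/M_NULLOK)
 *		return (ENOMEM);
 *	...
 *	free(fp, M_TEMP);
 */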

void *
realloc(void *ptr, unsigned long size, struct malloc_type *type, int flags)
{
	SLZone *z;
	void *nptr;
	unsigned long osize;

	if (ptr == NULL || ptr == ZERO_LENGTH_PTR)
		return(malloc(size, type, flags));
	if (size == 0) {
		free(ptr, type);
		return(NULL);
	}

	/*
	 * Handle oversized allocations.  XXX we really should require that a
	 * size be passed to free() instead of this nonsense.
	 */
	{
		struct kmemusage *kup;

		kup = btokup(ptr);
		if (kup->ku_pagecnt) {
			osize = kup->ku_pagecnt << PAGE_SHIFT;
			if (osize == round_page(size))
				return(ptr);
			if ((nptr = malloc(size, type, flags)) == NULL)
				return(NULL);
			bcopy(ptr, nptr, min(size, osize));
			free(ptr, type);
			return(nptr);
		}
	}

	/*
	 * Get the original allocation's zone.  If the new request winds up
	 * using the same chunk size we do not have to do anything.
	 */
	z = (SLZone *)((uintptr_t)ptr & ~(uintptr_t)ZoneMask);
	KKASSERT(z->z_Magic == ZALLOC_SLAB_MAGIC);

	zoneindex(&size);
	if (z->z_ChunkSize == size)
		return(ptr);

	/*
	 * Allocate memory for the new request size.  Note that zoneindex has
	 * already adjusted the request size to the appropriate chunk size,
	 * which should optimize our bcopy().  Then copy and return the new
	 * pointer.
	 */
	if ((nptr = malloc(size, type, flags)) == NULL)
		return(NULL);
	bcopy(ptr, nptr, min(size, z->z_ChunkSize));
	free(ptr, type);
	return(nptr);
}
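
/*
 * Note that because zoneindex() rounds the new size to its chunk size, a
 * realloc() that stays within the same chunk is a no-op: growing a
 * 100-byte allocation to 104 bytes, for example, hits the z_ChunkSize ==
 * size test above and simply returns the original pointer.
 */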

#ifdef SMP
/*
 * free()	(SLAB ALLOCATOR)
 *
 * Free the specified chunk of memory.
 */
static
void
free_remote(void *ptr)
{
	free(ptr, *(struct malloc_type **)ptr);
}

#endif

void
free(void *ptr, struct malloc_type *type)
{
	SLZone *z;
	SLChunk *chunk;
	SLGlobalData *slgd;
	struct globaldata *gd;
	int pgno;

	gd = mycpu;
	slgd = &gd->gd_slab;

	/*
	 * Handle special 0-byte allocations
	 */
	if (ptr == ZERO_LENGTH_PTR)
		return;

	/*
	 * Handle oversized allocations.  XXX we really should require that a
	 * size be passed to free() instead of this nonsense.
	 *
	 * This code is never called via an ipi.
	 */
	{
		struct kmemusage *kup;
		unsigned long size;

		kup = btokup(ptr);
		if (kup->ku_pagecnt) {
			size = kup->ku_pagecnt << PAGE_SHIFT;
			kup->ku_pagecnt = 0;
#ifdef INVARIANTS
			KKASSERT(sizeof(weirdary) <= size);
			bcopy(weirdary, ptr, sizeof(weirdary));
#endif
			/*
			 * note: we always adjust our cpu's slot, not the
			 * originating cpu (kup->ku_cpu).  The statistics are
			 * in aggregate.
			 */
			crit_enter();
			--type->ks_inuse[gd->gd_cpuid];
			type->ks_memuse[gd->gd_cpuid] -= size;
			if (mycpu->gd_intr_nesting_level) {
				z = (SLZone *)ptr;
				z->z_Magic = ZALLOC_OVSZ_MAGIC;
				z->z_Next = slgd->FreeOvZones;
				z->z_ChunkSize = size;
				slgd->FreeOvZones = z;
				crit_exit();
			} else {
				crit_exit();
				kmem_slab_free(ptr, size);	/* may block */
			}
			return;
		}
	}

	/*
	 * Zone case.  Figure out the zone based on the fact that it is
	 * ZoneSize aligned.
	 */
	z = (SLZone *)((uintptr_t)ptr & ~(uintptr_t)ZoneMask);
	KKASSERT(z->z_Magic == ZALLOC_SLAB_MAGIC);

	/*
	 * If we do not own the zone then forward the request to the
	 * cpu that does.  The freeing code does not need the byte count
	 * unless DIAGNOSTIC is set.
	 */
	if (z->z_Cpu != gd->gd_cpuid) {
		*(struct malloc_type **)ptr = type;
#ifdef SMP
		lwkt_send_ipiq(z->z_Cpu, free_remote, ptr);
#else
		panic("Corrupt SLZone");
#endif
		return;
	}

	if (type->ks_magic != M_MAGIC)
		panic("free: malloc type lacks magic");

	crit_enter();
	pgno = ((char *)ptr - (char *)z) >> PAGE_SHIFT;
	chunk = ptr;

#ifdef INVARIANTS
	/*
	 * Attempt to detect a double-free.  To reduce overhead we only check
	 * if there appears to be a link pointer at the base of the data.
	 */
	if (((intptr_t)chunk->c_Next - (intptr_t)z) >> PAGE_SHIFT == pgno) {
		SLChunk *scan;
		for (scan = z->z_PageAry[pgno]; scan; scan = scan->c_Next) {
			if (scan == chunk)
				panic("Double free at %p", chunk);
		}
	}
#endif

	/*
	 * Put weird data into the memory to detect modifications after
	 * freeing, illegal pointer use after freeing (we should fault on
	 * the odd address), and so forth.  XXX needs more work, see the
	 * old malloc code.
	 */
#ifdef INVARIANTS
	if (z->z_ChunkSize < sizeof(weirdary))
		bcopy(weirdary, chunk, z->z_ChunkSize);
	else
		bcopy(weirdary, chunk, sizeof(weirdary));
#endif

	/*
	 * Add this free non-zero'd chunk to a linked list for reuse, adjust
	 * z_FirstFreePg.
	 */
#ifdef INVARIANTS
	if ((uintptr_t)chunk < VM_MIN_KERNEL_ADDRESS)
		panic("BADFREE %p", chunk);
#endif
	chunk->c_Next = z->z_PageAry[pgno];
	z->z_PageAry[pgno] = chunk;
#ifdef INVARIANTS
	if (chunk->c_Next && (uintptr_t)chunk->c_Next < VM_MIN_KERNEL_ADDRESS)
		panic("BADFREE2");
#endif
	if (z->z_FirstFreePg > pgno)
		z->z_FirstFreePg = pgno;

	/*
	 * Bump the number of free chunks.  If it becomes non-zero the zone
	 * must be added back onto the appropriate list.
	 */
	if (z->z_NFree++ == 0) {
		z->z_Next = slgd->ZoneAry[z->z_ZoneIndex];
		slgd->ZoneAry[z->z_ZoneIndex] = z;
	}

	--type->ks_inuse[z->z_Cpu];
	type->ks_memuse[z->z_Cpu] -= z->z_ChunkSize;

	/*
	 * If the zone becomes totally free, and there are other zones we
	 * can allocate from, move this zone to the FreeZones list.  Since
	 * this code can be called from an IPI callback, do *NOT* try to mess
	 * with kernel_map here.  Hysteresis will be performed at malloc()
	 * time.
	 */
	if (z->z_NFree == z->z_NMax &&
	    (z->z_Next || slgd->ZoneAry[z->z_ZoneIndex] != z)
	) {
		SLZone **pz;

		for (pz = &slgd->ZoneAry[z->z_ZoneIndex]; z != *pz; pz = &(*pz)->z_Next)
			;
		*pz = z->z_Next;
		z->z_Magic = -1;
		z->z_Next = slgd->FreeZones;
		slgd->FreeZones = z;
		++slgd->NFreeZones;
	}
	crit_exit();
}
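
/*
 * Cross-cpu sketch: if cpu 1 frees a chunk whose zone has z_Cpu == 0, the
 * code above overwrites the chunk's first word with the malloc_type
 * pointer and queues free_remote() to cpu 0 via lwkt_send_ipiq(); cpu 0
 * then re-runs free() locally, so all zone list manipulation remains
 * cpu-private and lock-free.
 */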

/*
 * kmem_slab_alloc()
 *
 * Directly allocate and wire kernel memory in PAGE_SIZE chunks with the
 * specified alignment.  M_* flags are expected in the flags field.
 *
 * Alignment must be a multiple of PAGE_SIZE.
 *
 * NOTE! XXX For the moment we use vm_map_entry_reserve/release(),
 * but when we move zalloc() over to use this function as its backend
 * we will have to switch to kreserve/krelease and call reserve(0)
 * after the new space is made available.
 */
static void *
kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags)
{
	vm_size_t i;
	vm_offset_t addr;
	vm_offset_t offset;
	int count;
	vm_map_t map = kernel_map;

	size = round_page(size);
	addr = vm_map_min(map);

	/*
	 * Reserve properly aligned space from kernel_map
	 */
	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	crit_enter();
	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), size, align, &addr)) {
		vm_map_unlock(map);
		if ((flags & (M_NOWAIT|M_NULLOK)) == 0)
			panic("kmem_slab_alloc(): kernel_map ran out of space!");
		crit_exit();
		vm_map_entry_release(count);
		return(NULL);
	}
	offset = addr - VM_MIN_KERNEL_ADDRESS;
	vm_object_reference(kernel_object);
	vm_map_insert(map, &count,
	    kernel_object, offset, addr, addr + size,
	    VM_PROT_ALL, VM_PROT_ALL, 0);

	/*
	 * Allocate the pages.  Do not mess with the PG_ZERO flag yet.
	 */
	for (i = 0; i < size; i += PAGE_SIZE) {
		vm_page_t m;
		vm_pindex_t idx = OFF_TO_IDX(offset + i);
		int zero = (flags & M_ZERO) ? VM_ALLOC_ZERO : 0;

		if ((flags & (M_NOWAIT|M_USE_RESERVE)) == M_NOWAIT)
			m = vm_page_alloc(kernel_object, idx, VM_ALLOC_INTERRUPT|zero);
		else
			m = vm_page_alloc(kernel_object, idx, VM_ALLOC_SYSTEM|zero);
		if (m == NULL) {
			if ((flags & M_NOWAIT) == 0) {
				vm_map_unlock(map);
				vm_wait();
				vm_map_lock(map);
				i -= PAGE_SIZE;	/* retry */
				continue;
			}
			while (i != 0) {
				i -= PAGE_SIZE;
				m = vm_page_lookup(kernel_object, OFF_TO_IDX(offset + i));
				vm_page_free(m);
			}
			vm_map_delete(map, addr, addr + size, &count);
			vm_map_unlock(map);
			crit_exit();
			vm_map_entry_release(count);
			return(NULL);
		}
	}

	/*
	 * Mark the map entry as non-pageable using a routine that allows us
	 * to populate the underlying pages.
	 */
	vm_map_set_wired_quick(map, addr, size, &count);
	crit_exit();

	/*
	 * Enter the pages into the pmap and deal with PG_ZERO and M_ZERO.
	 */
	for (i = 0; i < size; i += PAGE_SIZE) {
		vm_page_t m;

		m = vm_page_lookup(kernel_object, OFF_TO_IDX(offset + i));
		m->valid = VM_PAGE_BITS_ALL;
		vm_page_wire(m);
		vm_page_wakeup(m);
		pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL, 1);
		if ((m->flags & PG_ZERO) == 0 && (flags & M_ZERO))
			bzero((char *)addr + i, PAGE_SIZE);
		vm_page_flag_clear(m, PG_ZERO);
		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE | PG_REFERENCED);
	}
	vm_map_unlock(map);
	vm_map_entry_release(count);
	return((void *)addr);
}
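
/*
 * Internal usage sketch: the zone code above obtains its zone-aligned
 * backing store with
 *
 *	z = kmem_slab_alloc(ZoneSize, ZoneSize, flags | M_ZERO);
 *
 * and the ZoneSize alignment is what lets free() recover a zone header
 * from any chunk pointer by masking with ~ZoneMask.
 */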

static void
kmem_slab_free(void *ptr, vm_size_t size)
{
	crit_enter();
	vm_map_remove(kernel_map, (vm_offset_t)ptr, (vm_offset_t)ptr + size);
	crit_exit();
}