gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* KERN_SLABALLOC.C - Kernel SLAB memory allocator
	3	*
	4	* Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
	5	*
	6	* This code is derived from software contributed to The DragonFly Project
	7	* by Matthew Dillon <dillon@backplane.com>
	8	*
	9	* Redistribution and use in source and binary forms, with or without
	10	* modification, are permitted provided that the following conditions
	11	* are met:
	12	*
	13	* 1. Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* 2. Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in
	17	* the documentation and/or other materials provided with the
	18	* distribution.
	19	* 3. Neither the name of The DragonFly Project nor the names of its
	20	* contributors may be used to endorse or promote products derived
	21	* from this software without specific, prior written permission.
	22	*
	23	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	24	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	25	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	26	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	27	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	28	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	29	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	30	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	31	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	32	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	33	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	34	* SUCH DAMAGE.
	35	*
	36	* $DragonFly: src/sys/kern/kern_slaballoc.c,v 1.27 2005/03/28 18:49:25 joerg Exp $
	37	*
	38	* This module implements a slab allocator drop-in replacement for the
	39	* kernel malloc().
	40	*
	41	* A slab allocator reserves a ZONE for each chunk size, then lays the
	42	* chunks out in an array within the zone. Allocation and deallocation
	43	* are nearly instantaneous, and fragmentation/overhead losses are limited
	44	* to a fixed worst-case amount.
	45	*
	46	* The downside of this slab implementation is in the chunk size
	47	* multiplied by the number of zones. ~80 zones * 128K = 10MB of VM per cpu.
	48	* In a kernel implementation all this memory will be physical so
	49	* the zone size is adjusted downward on machines with less physical
	50	* memory. The upside is that overhead is bounded... this is the worst
	51	* case overhead.
	52	*
	53	* Slab management is done on a per-cpu basis and no locking or mutexes
	54	* are required, only a critical section. When one cpu frees memory
	55	* belonging to another cpu's slab manager an asynchronous IPI message
	56	* will be queued to execute the operation. In addition, both the
	57	* high level slab allocator and the low level zone allocator optimize
	58	* M_ZERO requests, and the slab allocator does not have to pre initialize
	59	* the linked list of chunks.
	60	*
	61	* XXX Balancing is needed between cpus. Balance will be handled through
	62	* asynchronous IPIs primarily by reassigning the z_Cpu ownership of chunks.
	63	*
	64	* XXX If we have to allocate a new zone and M_USE_RESERVE is set, use of
	65	* the new zone should be restricted to M_USE_RESERVE requests only.
	66	*
	67	* Alloc Size Chunking Number of zones
	68	* 0-127 8 16
	69	* 128-255 16 8
	70	* 256-511 32 8
	71	* 512-1023 64 8
	72	* 1024-2047 128 8
	73	* 2048-4095 256 8
	74	* 4096-8191 512 8
	75	* 8192-16383 1024 8
	76	* 16384-32767 2048 8
	77	* (if PAGE_SIZE is 4K the maximum zone allocation is 16383)
	78	*
	79	* Allocations >= ZoneLimit go directly to kmem.
	80	*
	81	* API REQUIREMENTS AND SIDE EFFECTS
	82	*
	83	* To operate as a drop-in replacement to the FreeBSD-4.x malloc() we
	84	* have remained compatible with the following API requirements:
	85	*
	86	* + small power-of-2 sized allocations are power-of-2 aligned (kern_tty)
	87	* + all power-of-2 sized allocations are power-of-2 aligned (twe)
	88	* + malloc(0) is allowed and returns non-NULL (ahc driver)
	89	* + ability to allocate arbitrarily large chunks of memory
	90	*/
	91
	92	#include "opt_vm.h"
	93
	94	#include <sys/param.h>
	95	#include <sys/systm.h>
	96	#include <sys/kernel.h>
	97	#include <sys/slaballoc.h>
	98	#include <sys/mbuf.h>
	99	#include <sys/vmmeter.h>
	100	#include <sys/lock.h>
	101	#include <sys/thread.h>
	102	#include <sys/globaldata.h>
	103
	104	#include <vm/vm.h>
	105	#include <vm/vm_param.h>
	106	#include <vm/vm_kern.h>
	107	#include <vm/vm_extern.h>
	108	#include <vm/vm_object.h>
	109	#include <vm/pmap.h>
	110	#include <vm/vm_map.h>
	111	#include <vm/vm_page.h>
	112	#include <vm/vm_pageout.h>
	113
	114	#include <machine/cpu.h>
	115
	116	#include <sys/thread2.h>
	117
	118	#define arysize(ary) (sizeof(ary)/sizeof((ary)[0]))
	119
	120	/*
	121	* Fixed globals (not per-cpu)
	122	*/
	123	static int ZoneSize;
	124	static int ZoneLimit;
	125	static int ZonePageCount;
	126	static int ZoneMask;
	127	static struct malloc_type *kmemstatistics;
	128	static struct kmemusage *kmemusage;
	129	static int32_t weirdary[16];
	130
	131	static void *kmem_slab_alloc(vm_size_t bytes, vm_offset_t align, int flags);
	132	static void kmem_slab_free(void *ptr, vm_size_t bytes);
	133
	134	/*
	135	* Misc constants. Note that allocations that are exact multiples of
	136	* PAGE_SIZE, or exceed the zone limit, fall through to the kmem module.
	137	* IN_SAME_PAGE_MASK is used to sanity-check the per-page free lists.
	138	*/
	139	#define MIN_CHUNK_SIZE 8 /* in bytes */
	140	#define MIN_CHUNK_MASK (MIN_CHUNK_SIZE - 1)
	141	#define ZONE_RELS_THRESH 2 /* threshold number of zones */
	142	#define IN_SAME_PAGE_MASK (~(intptr_t)PAGE_MASK \| MIN_CHUNK_MASK)
	143
	144	/*
	145	* The WEIRD_ADDR is used as known text to copy into free objects to
	146	* try to create deterministic failure cases if the data is accessed after
	147	* free.
	148	*/
	149	#define WEIRD_ADDR 0xdeadc0de
	150	#define MAX_COPY sizeof(weirdary)
	151	#define ZERO_LENGTH_PTR ((void *)-8)
	152
	153	/*
	154	* Misc global malloc buckets
	155	*/
	156
	157	MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches");
	158	MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
	159	MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers");
	160
	161	MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");
	162	MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery");
	163
	164	/*
	165	* Initialize the slab memory allocator. We have to choose a zone size based
	166	* on available physical memory. We choose a zone side which is approximately
	167	* 1/1024th of our memory, so if we have 128MB of ram we have a zone size of
	168	* 128K. The zone size is limited to the bounds set in slaballoc.h
	169	* (typically 32K min, 128K max).
	170	*/
	171	static void kmeminit(void *dummy);
	172
	173	SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL)
	174
	175	static void
	176	kmeminit(void *dummy)
	177	{
	178	vm_poff_t limsize;
	179	int usesize;
	180	int i;
	181	vm_pindex_t npg;
	182
	183	limsize = (vm_poff_t)vmstats.v_page_count * PAGE_SIZE;
	184	if (limsize > VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS)
	185	limsize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
	186
	187	usesize = (int)(limsize / 1024); /* convert to KB */
	188
	189	ZoneSize = ZALLOC_MIN_ZONE_SIZE;
	190	while (ZoneSize < ZALLOC_MAX_ZONE_SIZE && (ZoneSize << 1) < usesize)
	191	ZoneSize <<= 1;
	192	ZoneLimit = ZoneSize / 4;
	193	if (ZoneLimit > ZALLOC_ZONE_LIMIT)
	194	ZoneLimit = ZALLOC_ZONE_LIMIT;
	195	ZoneMask = ZoneSize - 1;
	196	ZonePageCount = ZoneSize / PAGE_SIZE;
	197
	198	npg = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / PAGE_SIZE;
	199	kmemusage = kmem_slab_alloc(npg * sizeof(struct kmemusage), PAGE_SIZE, M_WAITOK\|M_ZERO);
	200
	201	for (i = 0; i < arysize(weirdary); ++i)
	202	weirdary[i] = WEIRD_ADDR;
	203
	204	if (bootverbose)
	205	printf("Slab ZoneSize set to %dKB\n", ZoneSize / 1024);
	206	}
	207
	208	/*
	209	* Initialize a malloc type tracking structure.
	210	*/
	211	void
	212	malloc_init(void *data)
	213	{
	214	struct malloc_type *type = data;
	215	vm_poff_t limsize;
	216
	217	if (type->ks_magic != M_MAGIC)
	218	panic("malloc type lacks magic");
	219
	220	if (type->ks_limit != 0)
	221	return;
	222
	223	if (vmstats.v_page_count == 0)
	224	panic("malloc_init not allowed before vm init");
	225
	226	limsize = (vm_poff_t)vmstats.v_page_count * PAGE_SIZE;
	227	if (limsize > VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS)
	228	limsize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
	229	type->ks_limit = limsize / 10;
	230
	231	type->ks_next = kmemstatistics;
	232	kmemstatistics = type;
	233	}
	234
	235	void
	236	malloc_uninit(void *data)
	237	{
	238	struct malloc_type *type = data;
	239	struct malloc_type *t;
	240	#ifdef INVARIANTS
	241	int i;
	242	long ttl;
	243	#endif
	244
	245	if (type->ks_magic != M_MAGIC)
	246	panic("malloc type lacks magic");
	247
	248	if (vmstats.v_page_count == 0)
	249	panic("malloc_uninit not allowed before vm init");
	250
	251	if (type->ks_limit == 0)
	252	panic("malloc_uninit on uninitialized type");
	253
	254	#ifdef INVARIANTS
	255	/*
	256	* memuse is only correct in aggregation. Due to memory being allocated
	257	* on one cpu and freed on another individual array entries may be
	258	* negative or positive (canceling each other out).
	259	*/
	260	for (i = ttl = 0; i < ncpus; ++i)
	261	ttl += type->ks_memuse[i];
	262	if (ttl) {
	263	printf("malloc_uninit: %ld bytes of '%s' still allocated on cpu %d\n",
	264	ttl, type->ks_shortdesc, i);
	265	}
	266	#endif
	267	if (type == kmemstatistics) {
	268	kmemstatistics = type->ks_next;
	269	} else {
	270	for (t = kmemstatistics; t->ks_next != NULL; t = t->ks_next) {
	271	if (t->ks_next == type) {
	272	t->ks_next = type->ks_next;
	273	break;
	274	}
	275	}
	276	}
	277	type->ks_next = NULL;
	278	type->ks_limit = 0;
	279	}
	280
	/*
	 * Calculate the zone index for the allocation request size and set the
	 * allocation request size to that particular zone's chunk size.
	 *
	 * bytes - in: requested size; out: size rounded up to the chunking of
	 *	   the size class it falls into.
	 *
	 * Returns the zone index.  Panics if the request is larger than the
	 * largest size class compiled in (governed by ZALLOC_ZONE_LIMIT), so
	 * callers must filter out oversized requests first.
	 */
	static __inline int
	zoneindex(unsigned long *bytes)
	{
	    unsigned int n = (unsigned int)*bytes;	/* unsigned for shift opt */

	    if (n < 128) {
		*bytes = n = (n + 7) & ~7;
		return(n / 8 - 1);		/* 8 byte chunks, 16 zones */
	    }
	    if (n < 256) {
		*bytes = n = (n + 15) & ~15;
		return(n / 16 + 7);
	    }
	    if (n < 8192) {
		if (n < 512) {
		    *bytes = n = (n + 31) & ~31;
		    return(n / 32 + 15);
		}
		if (n < 1024) {
		    *bytes = n = (n + 63) & ~63;
		    return(n / 64 + 23);
		}
		if (n < 2048) {
		    *bytes = n = (n + 127) & ~127;
		    return(n / 128 + 31);
		}
		if (n < 4096) {
		    *bytes = n = (n + 255) & ~255;
		    return(n / 256 + 39);
		}
		*bytes = n = (n + 511) & ~511;
		return(n / 512 + 47);
	    }
	#if ZALLOC_ZONE_LIMIT > 8192
	    if (n < 16384) {
		*bytes = n = (n + 1023) & ~1023;
		return(n / 1024 + 55);
	    }
	#endif
	#if ZALLOC_ZONE_LIMIT > 16384
	    if (n < 32768) {
		*bytes = n = (n + 2047) & ~2047;
		return(n / 2048 + 63);
	    }
	#endif
	    panic("Unexpected byte count %u", n);
	    return(0);
	}
	332
	333	/*
	334	* malloc() (SLAB ALLOCATOR)
	335	*
	336	* Allocate memory via the slab allocator. If the request is too large,
	337	* or if it page-aligned beyond a certain size, we fall back to the
	338	* KMEM subsystem. A SLAB tracking descriptor must be specified, use
	339	* &SlabMisc if you don't care.
	340	*
	341	* M_RNOWAIT - don't block.
	342	* M_NULLOK - return NULL instead of blocking.
	343	* M_ZERO - zero the returned memory.
	344	* M_USE_RESERVE - allow greater drawdown of the free list
	345	* M_USE_INTERRUPT_RESERVE - allow the freelist to be exhausted
	346	*/
	347	void *
	348	malloc(unsigned long size, struct malloc_type *type, int flags)
	349	{
	350	SLZone *z;
	351	SLChunk *chunk;
	352	SLGlobalData *slgd;
	353	struct globaldata *gd;
	354	int zi;
	355
	356	gd = mycpu;
	357	slgd = &gd->gd_slab;
	358
	359	/*
	360	* XXX silly to have this in the critical path.
	361	*/
	362	if (type->ks_limit == 0) {
	363	crit_enter();
	364	if (type->ks_limit == 0)
	365	malloc_init(type);
	366	crit_exit();
	367	}
	368	++type->ks_calls;
	369
	370	/*
	371	* Handle the case where the limit is reached. Panic if can't return
	372	* NULL. XXX the original malloc code looped, but this tended to
	373	* simply deadlock the computer.
	374	*/
	375	while (type->ks_loosememuse >= type->ks_limit) {
	376	int i;
	377	long ttl;
	378
	379	for (i = ttl = 0; i < ncpus; ++i)
	380	ttl += type->ks_memuse[i];
	381	type->ks_loosememuse = ttl;
	382	if (ttl >= type->ks_limit) {
	383	if (flags & M_NULLOK)
	384	return(NULL);
	385	panic("%s: malloc limit exceeded", type->ks_shortdesc);
	386	}
	387	}
	388
	389	/*
	390	* Handle the degenerate size == 0 case. Yes, this does happen.
	391	* Return a special pointer. This is to maintain compatibility with
	392	* the original malloc implementation. Certain devices, such as the
	393	* adaptec driver, not only allocate 0 bytes, they check for NULL and
	394	* also realloc() later on. Joy.
	395	*/
	396	if (size == 0)
	397	return(ZERO_LENGTH_PTR);
	398
	399	/*
	400	* Handle hysteresis from prior frees here in malloc(). We cannot
	401	* safely manipulate the kernel_map in free() due to free() possibly
	402	* being called via an IPI message or from sensitive interrupt code.
	403	*/
	404	while (slgd->NFreeZones > ZONE_RELS_THRESH && (flags & M_RNOWAIT) == 0) {
	405	crit_enter();
	406	if (slgd->NFreeZones > ZONE_RELS_THRESH) { /* crit sect race */
	407	z = slgd->FreeZones;
	408	slgd->FreeZones = z->z_Next;
	409	--slgd->NFreeZones;
	410	kmem_slab_free(z, ZoneSize); /* may block */
	411	}
	412	crit_exit();
	413	}
	414	/*
	415	* XXX handle oversized frees that were queued from free().
	416	*/
	417	while (slgd->FreeOvZones && (flags & M_RNOWAIT) == 0) {
	418	crit_enter();
	419	if ((z = slgd->FreeOvZones) != NULL) {
	420	KKASSERT(z->z_Magic == ZALLOC_OVSZ_MAGIC);
	421	slgd->FreeOvZones = z->z_Next;
	422	kmem_slab_free(z, z->z_ChunkSize); /* may block */
	423	}
	424	crit_exit();
	425	}
	426
	427	/*
	428	* Handle large allocations directly. There should not be very many of
	429	* these so performance is not a big issue.
	430	*
	431	* Guarentee page alignment for allocations in multiples of PAGE_SIZE
	432	*/
	433	if (size >= ZoneLimit \|\| (size & PAGE_MASK) == 0) {
	434	struct kmemusage *kup;
	435
	436	size = round_page(size);
	437	chunk = kmem_slab_alloc(size, PAGE_SIZE, flags);
	438	if (chunk == NULL)
	439	return(NULL);
	440	flags &= ~M_ZERO; /* result already zero'd if M_ZERO was set */
	441	flags \|= M_PASSIVE_ZERO;
	442	kup = btokup(chunk);
	443	kup->ku_pagecnt = size / PAGE_SIZE;
	444	kup->ku_cpu = gd->gd_cpuid;
	445	crit_enter();
	446	goto done;
	447	}
	448
	449	/*
	450	* Attempt to allocate out of an existing zone. First try the free list,
	451	* then allocate out of unallocated space. If we find a good zone move
	452	* it to the head of the list so later allocations find it quickly
	453	* (we might have thousands of zones in the list).
	454	*
	455	* Note: zoneindex() will panic of size is too large.
	456	*/
	457	zi = zoneindex(&size);
	458	KKASSERT(zi < NZONES);
	459	crit_enter();
	460	if ((z = slgd->ZoneAry[zi]) != NULL) {
	461	KKASSERT(z->z_NFree > 0);
	462
	463	/*
	464	* Remove us from the ZoneAry[] when we become empty
	465	*/
	466	if (--z->z_NFree == 0) {
	467	slgd->ZoneAry[zi] = z->z_Next;
	468	z->z_Next = NULL;
	469	}
	470
	471	/*
	472	* Locate a chunk in a free page. This attempts to localize
	473	* reallocations into earlier pages without us having to sort
	474	* the chunk list. A chunk may still overlap a page boundary.
	475	*/
	476	while (z->z_FirstFreePg < ZonePageCount) {
	477	if ((chunk = z->z_PageAry[z->z_FirstFreePg]) != NULL) {
	478	#ifdef DIAGNOSTIC
	479	/*
	480	* Diagnostic: c_Next is not total garbage.
	481	*/
	482	KKASSERT(chunk->c_Next == NULL \|\|
	483	((intptr_t)chunk->c_Next & IN_SAME_PAGE_MASK) ==
	484	((intptr_t)chunk & IN_SAME_PAGE_MASK));
	485	#endif
	486	#ifdef INVARIANTS
	487	if ((uintptr_t)chunk < VM_MIN_KERNEL_ADDRESS)
	488	panic("chunk %p FFPG %d/%d", chunk, z->z_FirstFreePg, ZonePageCount);
	489	if (chunk->c_Next && (uintptr_t)chunk->c_Next < VM_MIN_KERNEL_ADDRESS)
	490	panic("chunkNEXT %p %p FFPG %d/%d", chunk, chunk->c_Next, z->z_FirstFreePg, ZonePageCount);
	491	#endif
	492	z->z_PageAry[z->z_FirstFreePg] = chunk->c_Next;
	493	goto done;
	494	}
	495	++z->z_FirstFreePg;
	496	}
	497
	498	/*
	499	* No chunks are available but NFree said we had some memory, so
	500	* it must be available in the never-before-used-memory area
	501	* governed by UIndex. The consequences are very serious if our zone
	502	* got corrupted so we use an explicit panic rather then a KASSERT.
	503	*/
	504	if (z->z_UIndex + 1 != z->z_NMax)
	505	z->z_UIndex = z->z_UIndex + 1;
	506	else
	507	z->z_UIndex = 0;
	508	if (z->z_UIndex == z->z_UEndIndex)
	509	panic("slaballoc: corrupted zone");
	510	chunk = (SLChunk )(z->z_BasePtr + z->z_UIndex size);
	511	if ((z->z_Flags & SLZF_UNOTZEROD) == 0) {
	512	flags &= ~M_ZERO;
	513	flags \|= M_PASSIVE_ZERO;
	514	}
	515	goto done;
	516	}
	517
	518	/*
	519	* If all zones are exhausted we need to allocate a new zone for this
	520	* index. Use M_ZERO to take advantage of pre-zerod pages. Also see
	521	* UAlloc use above in regards to M_ZERO. Note that when we are reusing
	522	* a zone from the FreeZones list UAlloc'd data will not be zero'd, and
	523	* we do not pre-zero it because we do not want to mess up the L1 cache.
	524	*
	525	* At least one subsystem, the tty code (see CROUND) expects power-of-2
	526	* allocations to be power-of-2 aligned. We maintain compatibility by
	527	* adjusting the base offset below.
	528	*/
	529	{
	530	int off;
	531
	532	if ((z = slgd->FreeZones) != NULL) {
	533	slgd->FreeZones = z->z_Next;
	534	--slgd->NFreeZones;
	535	bzero(z, sizeof(SLZone));
	536	z->z_Flags \|= SLZF_UNOTZEROD;
	537	} else {
	538	z = kmem_slab_alloc(ZoneSize, ZoneSize, flags\|M_ZERO);
	539	if (z == NULL)
	540	goto fail;
	541	}
	542
	543	/*
	544	* Guarentee power-of-2 alignment for power-of-2-sized chunks.
	545	* Otherwise just 8-byte align the data.
	546	*/
	547	if ((size \| (size - 1)) + 1 == (size << 1))
	548	off = (sizeof(SLZone) + size - 1) & ~(size - 1);
	549	else
	550	off = (sizeof(SLZone) + MIN_CHUNK_MASK) & ~MIN_CHUNK_MASK;
	551	z->z_Magic = ZALLOC_SLAB_MAGIC;
	552	z->z_ZoneIndex = zi;
	553	z->z_NMax = (ZoneSize - off) / size;
	554	z->z_NFree = z->z_NMax - 1;
	555	z->z_BasePtr = (char *)z + off;
	556	z->z_UIndex = z->z_UEndIndex = slgd->JunkIndex % z->z_NMax;
	557	z->z_ChunkSize = size;
	558	z->z_FirstFreePg = ZonePageCount;
	559	z->z_CpuGd = gd;
	560	z->z_Cpu = gd->gd_cpuid;
	561	chunk = (SLChunk )(z->z_BasePtr + z->z_UIndex size);
	562	z->z_Next = slgd->ZoneAry[zi];
	563	slgd->ZoneAry[zi] = z;
	564	if ((z->z_Flags & SLZF_UNOTZEROD) == 0) {
	565	flags &= ~M_ZERO; /* already zero'd */
	566	flags \|= M_PASSIVE_ZERO;
	567	}
	568
	569	/*
	570	* Slide the base index for initial allocations out of the next
	571	* zone we create so we do not over-weight the lower part of the
	572	* cpu memory caches.
	573	*/
	574	slgd->JunkIndex = (slgd->JunkIndex + ZALLOC_SLAB_SLIDE)
	575	& (ZALLOC_MAX_ZONE_SIZE - 1);
	576	}
	577	done:
	578	++type->ks_inuse[gd->gd_cpuid];
	579	type->ks_memuse[gd->gd_cpuid] += size;
	580	type->ks_loosememuse += size;
	581	crit_exit();
	582	if (flags & M_ZERO)
	583	bzero(chunk, size);
	584	#ifdef INVARIANTS
	585	else if ((flags & (M_ZERO\|M_PASSIVE_ZERO)) == 0)
	586	chunk->c_Next = (void )-1; / avoid accidental double-free check */
	587	#endif
	588	return(chunk);
	589	fail:
	590	crit_exit();
	591	return(NULL);
	592	}
	593
	594	void *
	595	realloc(void ptr, unsigned long size, struct malloc_type type, int flags)
	596	{
	597	SLZone *z;
	598	void *nptr;
	599	unsigned long osize;
	600
	601	KKASSERT((flags & M_ZERO) == 0); /* not supported */
	602
	603	if (ptr == NULL \|\| ptr == ZERO_LENGTH_PTR)
	604	return(malloc(size, type, flags));
	605	if (size == 0) {
	606	free(ptr, type);
	607	return(NULL);
	608	}
	609
	610	/*
	611	* Handle oversized allocations. XXX we really should require that a
	612	* size be passed to free() instead of this nonsense.
	613	*/
	614	{
	615	struct kmemusage *kup;
	616
	617	kup = btokup(ptr);
	618	if (kup->ku_pagecnt) {
	619	osize = kup->ku_pagecnt << PAGE_SHIFT;
	620	if (osize == round_page(size))
	621	return(ptr);
	622	if ((nptr = malloc(size, type, flags)) == NULL)
	623	return(NULL);
	624	bcopy(ptr, nptr, min(size, osize));
	625	free(ptr, type);
	626	return(nptr);
	627	}
	628	}
	629
	630	/*
	631	* Get the original allocation's zone. If the new request winds up
	632	* using the same chunk size we do not have to do anything.
	633	*/
	634	z = (SLZone *)((uintptr_t)ptr & ~(uintptr_t)ZoneMask);
	635	KKASSERT(z->z_Magic == ZALLOC_SLAB_MAGIC);
	636
	637	zoneindex(&size);
	638	if (z->z_ChunkSize == size)
	639	return(ptr);
	640
	641	/*
	642	* Allocate memory for the new request size. Note that zoneindex has
	643	* already adjusted the request size to the appropriate chunk size, which
	644	* should optimize our bcopy(). Then copy and return the new pointer.
	645	*/
	646	if ((nptr = malloc(size, type, flags)) == NULL)
	647	return(NULL);
	648	bcopy(ptr, nptr, min(size, z->z_ChunkSize));
	649	free(ptr, type);
	650	return(nptr);
	651	}
	652
	653	char *
	654	strdup(const char str, struct malloc_type type)
	655	{
	656	int zlen; /* length inclusive of terminating NUL */
	657	char *nstr;
	658
	659	if (str == NULL)
	660	return(NULL);
	661	zlen = strlen(str) + 1;
	662	nstr = malloc(zlen, type, M_WAITOK);
	663	bcopy(str, nstr, zlen);
	664	return(nstr);
	665	}
	666
	#ifdef SMP
	/*
	 * free_remote()
	 *
	 *	IPI callback used by free() when a chunk belongs to a zone owned by
	 *	another cpu.  Before sending the IPI, free() stores the malloc type
	 *	pointer in the first pointer-sized word of the chunk being freed;
	 *	recover it from there and redo the free on the cpu running this
	 *	callback.
	 */
	static
	void
	free_remote(void *ptr)
	{
	    free(ptr, *(struct malloc_type **)ptr);
	}

	#endif
	681
	682	void
	683	free(void ptr, struct malloc_type type)
	684	{
	685	SLZone *z;
	686	SLChunk *chunk;
	687	SLGlobalData *slgd;
	688	struct globaldata *gd;
	689	int pgno;
	690
	691	gd = mycpu;
	692	slgd = &gd->gd_slab;
	693
	694	if (ptr == NULL)
	695	panic("trying to free NULL pointer");
	696
	697	/*
	698	* Handle special 0-byte allocations
	699	*/
	700	if (ptr == ZERO_LENGTH_PTR)
	701	return;
	702
	703	/*
	704	* Handle oversized allocations. XXX we really should require that a
	705	* size be passed to free() instead of this nonsense.
	706	*
	707	* This code is never called via an ipi.
	708	*/
	709	{
	710	struct kmemusage *kup;
	711	unsigned long size;
	712
	713	kup = btokup(ptr);
	714	if (kup->ku_pagecnt) {
	715	size = kup->ku_pagecnt << PAGE_SHIFT;
	716	kup->ku_pagecnt = 0;
	717	#ifdef INVARIANTS
	718	KKASSERT(sizeof(weirdary) <= size);
	719	bcopy(weirdary, ptr, sizeof(weirdary));
	720	#endif
	721	/*
	722	* note: we always adjust our cpu's slot, not the originating
	723	* cpu (kup->ku_cpuid). The statistics are in aggregate.
	724	*
	725	* note: XXX we have still inherited the interrupts-can't-block
	726	* assumption. An interrupt thread does not bump
	727	* gd_intr_nesting_level so check TDF_INTTHREAD. This is
	728	* primarily until we can fix softupdate's assumptions about free().
	729	*/
	730	crit_enter();
	731	--type->ks_inuse[gd->gd_cpuid];
	732	type->ks_memuse[gd->gd_cpuid] -= size;
	733	if (mycpu->gd_intr_nesting_level \|\| (gd->gd_curthread->td_flags & TDF_INTTHREAD)) {
	734	z = (SLZone *)ptr;
	735	z->z_Magic = ZALLOC_OVSZ_MAGIC;
	736	z->z_Next = slgd->FreeOvZones;
	737	z->z_ChunkSize = size;
	738	slgd->FreeOvZones = z;
	739	crit_exit();
	740	} else {
	741	crit_exit();
	742	kmem_slab_free(ptr, size); /* may block */
	743	}
	744	return;
	745	}
	746	}
	747
	748	/*
	749	* Zone case. Figure out the zone based on the fact that it is
	750	* ZoneSize aligned.
	751	*/
	752	z = (SLZone *)((uintptr_t)ptr & ~(uintptr_t)ZoneMask);
	753	KKASSERT(z->z_Magic == ZALLOC_SLAB_MAGIC);
	754
	755	/*
	756	* If we do not own the zone then forward the request to the
	757	* cpu that does.
	758	*/
	759	if (z->z_CpuGd != gd) {
	760	(struct malloc_type *)ptr = type;
	761	#ifdef SMP
	762	lwkt_send_ipiq(z->z_CpuGd, free_remote, ptr);
	763	#else
	764	panic("Corrupt SLZone");
	765	#endif
	766	return;
	767	}
	768
	769	if (type->ks_magic != M_MAGIC)
	770	panic("free: malloc type lacks magic");
	771
	772	crit_enter();
	773	pgno = ((char )ptr - (char )z) >> PAGE_SHIFT;
	774	chunk = ptr;
	775
	776	#ifdef INVARIANTS
	777	/*
	778	* Attempt to detect a double-free. To reduce overhead we only check
	779	* if there appears to be link pointer at the base of the data.
	780	*/
	781	if (((intptr_t)chunk->c_Next - (intptr_t)z) >> PAGE_SHIFT == pgno) {
	782	SLChunk *scan;
	783	for (scan = z->z_PageAry[pgno]; scan; scan = scan->c_Next) {
	784	if (scan == chunk)
	785	panic("Double free at %p", chunk);
	786	}
	787	}
	788	#endif
	789
	790	/*
	791	* Put weird data into the memory to detect modifications after freeing,
	792	* illegal pointer use after freeing (we should fault on the odd address),
	793	* and so forth. XXX needs more work, see the old malloc code.
	794	*/
	795	#ifdef INVARIANTS
	796	if (z->z_ChunkSize < sizeof(weirdary))
	797	bcopy(weirdary, chunk, z->z_ChunkSize);
	798	else
	799	bcopy(weirdary, chunk, sizeof(weirdary));
	800	#endif
	801
	802	/*
	803	* Add this free non-zero'd chunk to a linked list for reuse, adjust
	804	* z_FirstFreePg.
	805	*/
	806	#ifdef INVARIANTS
	807	if ((uintptr_t)chunk < VM_MIN_KERNEL_ADDRESS)
	808	panic("BADFREE %p", chunk);
	809	#endif
	810	chunk->c_Next = z->z_PageAry[pgno];
	811	z->z_PageAry[pgno] = chunk;
	812	#ifdef INVARIANTS
	813	if (chunk->c_Next && (uintptr_t)chunk->c_Next < VM_MIN_KERNEL_ADDRESS)
	814	panic("BADFREE2");
	815	#endif
	816	if (z->z_FirstFreePg > pgno)
	817	z->z_FirstFreePg = pgno;
	818
	819	/*
	820	* Bump the number of free chunks. If it becomes non-zero the zone
	821	* must be added back onto the appropriate list.
	822	*/
	823	if (z->z_NFree++ == 0) {
	824	z->z_Next = slgd->ZoneAry[z->z_ZoneIndex];
	825	slgd->ZoneAry[z->z_ZoneIndex] = z;
	826	}
	827
	828	--type->ks_inuse[z->z_Cpu];
	829	type->ks_memuse[z->z_Cpu] -= z->z_ChunkSize;
	830
	831	/*
	832	* If the zone becomes totally free, and there are other zones we
	833	* can allocate from, move this zone to the FreeZones list. Since
	834	* this code can be called from an IPI callback, do NOT try to mess
	835	* with kernel_map here. Hysteresis will be performed at malloc() time.
	836	*/
	837	if (z->z_NFree == z->z_NMax &&
	838	(z->z_Next \|\| slgd->ZoneAry[z->z_ZoneIndex] != z)
	839	) {
	840	SLZone **pz;
	841
	842	for (pz = &slgd->ZoneAry[z->z_ZoneIndex]; z != pz; pz = &(pz)->z_Next)
	843	;
	844	*pz = z->z_Next;
	845	z->z_Magic = -1;
	846	z->z_Next = slgd->FreeZones;
	847	slgd->FreeZones = z;
	848	++slgd->NFreeZones;
	849	}
	850	crit_exit();
	851	}
	852
	853	/*
	854	* kmem_slab_alloc()
	855	*
	856	* Directly allocate and wire kernel memory in PAGE_SIZE chunks with the
	857	* specified alignment. M_* flags are expected in the flags field.
	858	*
	859	* Alignment must be a multiple of PAGE_SIZE.
	860	*
	861	* NOTE! XXX For the moment we use vm_map_entry_reserve/release(),
	862	* but when we move zalloc() over to use this function as its backend
	863	* we will have to switch to kreserve/krelease and call reserve(0)
	864	* after the new space is made available.
	865	*
	866	* Interrupt code which has preempted other code is not allowed to
	867	* use PQ_CACHE pages. However, if an interrupt thread is run
	868	* non-preemptively or blocks and then runs non-preemptively, then
	869	* it is free to use PQ_CACHE pages.
	870	*/
	871	static void *
	872	kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags)
	873	{
	874	vm_size_t i;
	875	vm_offset_t addr;
	876	vm_offset_t offset;
	877	int count;
	878	thread_t td;
	879	vm_map_t map = kernel_map;
	880
	881	size = round_page(size);
	882	addr = vm_map_min(map);
	883
	884	/*
	885	* Reserve properly aligned space from kernel_map
	886	*/
	887	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	888	crit_enter();
	889	vm_map_lock(map);
	890	if (vm_map_findspace(map, vm_map_min(map), size, align, &addr)) {
	891	vm_map_unlock(map);
	892	if ((flags & M_NULLOK) == 0)
	893	panic("kmem_slab_alloc(): kernel_map ran out of space!");
	894	crit_exit();
	895	vm_map_entry_release(count);
	896	return(NULL);
	897	}
	898	offset = addr - VM_MIN_KERNEL_ADDRESS;
	899	vm_object_reference(kernel_object);
	900	vm_map_insert(map, &count,
	901	kernel_object, offset, addr, addr + size,
	902	VM_PROT_ALL, VM_PROT_ALL, 0);
	903
	904	td = curthread;
	905
	906	/*
	907	* Allocate the pages. Do not mess with the PG_ZERO flag yet.
	908	*/
	909	for (i = 0; i < size; i += PAGE_SIZE) {
	910	vm_page_t m;
	911	vm_pindex_t idx = OFF_TO_IDX(offset + i);
	912	int vmflags = 0;
	913
	914	if (flags & M_ZERO)
	915	vmflags \|= VM_ALLOC_ZERO;
	916	if (flags & M_USE_RESERVE)
	917	vmflags \|= VM_ALLOC_SYSTEM;
	918	if (flags & M_USE_INTERRUPT_RESERVE)
	919	vmflags \|= VM_ALLOC_INTERRUPT;
	920	if ((flags & (M_RNOWAIT\|M_WAITOK)) == 0)
	921	panic("kmem_slab_alloc: bad flags %08x (%p)", flags, ((int **)&size)[-1]);
	922
	923	/*
	924	* VM_ALLOC_NORMAL can only be set if we are not preempting.
	925	*
	926	* VM_ALLOC_SYSTEM is automatically set if we are preempting and
	927	* M_WAITOK was specified as an alternative (i.e. M_USE_RESERVE is
	928	* implied in this case), though I'm sure if we really need to do
	929	* that.
	930	*/
	931	if (flags & M_WAITOK) {
	932	if (td->td_preempted) {
	933	vmflags \|= VM_ALLOC_SYSTEM;
	934	} else {
	935	vmflags \|= VM_ALLOC_NORMAL;
	936	}
	937	}
	938
	939	m = vm_page_alloc(kernel_object, idx, vmflags);
	940
	941	/*
	942	* If the allocation failed we either return NULL or we retry.
	943	*
	944	* If M_WAITOK is specified we wait for more memory and retry.
	945	* If M_WAITOK is specified from a preemption we yield instead of
	946	* wait. Livelock will not occur because the interrupt thread
	947	* will not be preempting anyone the second time around after the
	948	* yield.
	949	*/
	950	if (m == NULL) {
	951	if (flags & M_WAITOK) {
	952	if (td->td_preempted) {
	953	vm_map_unlock(map);
	954	lwkt_yield();
	955	vm_map_lock(map);
	956	} else {
	957	vm_map_unlock(map);
	958	vm_wait();
	959	vm_map_lock(map);
	960	}
	961	i -= PAGE_SIZE; /* retry */
	962	continue;
	963	}
	964
	965	/*
	966	* We were unable to recover, cleanup and return NULL
	967	*/
	968	while (i != 0) {
	969	i -= PAGE_SIZE;
	970	m = vm_page_lookup(kernel_object, OFF_TO_IDX(offset + i));
	971	vm_page_free(m);
	972	}
	973	vm_map_delete(map, addr, addr + size, &count);
	974	vm_map_unlock(map);
	975	crit_exit();
	976	vm_map_entry_release(count);
	977	return(NULL);
	978	}
	979	}
	980
	981	/*
	982	* Success!
	983	*
	984	* Mark the map entry as non-pageable using a routine that allows us to
	985	* populate the underlying pages.
	986	*/
	987	vm_map_set_wired_quick(map, addr, size, &count);
	988	crit_exit();
	989
	990	/*
	991	* Enter the pages into the pmap and deal with PG_ZERO and M_ZERO.
	992	*/
	993	for (i = 0; i < size; i += PAGE_SIZE) {
	994	vm_page_t m;
	995
	996	m = vm_page_lookup(kernel_object, OFF_TO_IDX(offset + i));
	997	m->valid = VM_PAGE_BITS_ALL;
	998	vm_page_wire(m);
	999	vm_page_wakeup(m);
	1000	pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL, 1);
	1001	if ((m->flags & PG_ZERO) == 0 && (flags & M_ZERO))
	1002	bzero((char *)addr + i, PAGE_SIZE);
	1003	vm_page_flag_clear(m, PG_ZERO);
	1004	vm_page_flag_set(m, PG_MAPPED \| PG_WRITEABLE \| PG_REFERENCED);
	1005	}
	1006	vm_map_unlock(map);
	1007	vm_map_entry_release(count);
	1008	return((void *)addr);
	1009	}
	1010
	1011	static void
	1012	kmem_slab_free(void *ptr, vm_size_t size)
	1013	{
	1014	crit_enter();
	1015	vm_map_remove(kernel_map, (vm_offset_t)ptr, (vm_offset_t)ptr + size);
	1016	crit_exit();
	1017	}
	1018