/*
 * Copyright (c) 1997, 1998 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 *
 * $FreeBSD: src/sys/vm/vm_zone.c,v 1.30.2.6 2002/10/10 19:50:16 dillon Exp $
 * $DragonFly: src/sys/vm/vm_zone.c,v 1.28 2008/01/23 17:35:48 nth Exp $
 */
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <sys/spinlock2.h>
static MALLOC_DEFINE(M_ZONE, "ZONE", "Zone header");

#define ZONE_ERROR_INVALID	0
#define ZONE_ERROR_NOTFREE	1
#define ZONE_ERROR_ALREADYFREE	2

#define ZONE_ROUNDING	32

#define ZENTRY_FREE	0x12342378

static void *zget(vm_zone_t z);
#ifdef INVARIANTS
static void zerror(int error);		/* defined at the end of this file */
#endif
/*
 * Return an item from the specified zone.  This function is non-blocking for
 * ZONE_INTERRUPT zones.
 */
void *
zalloc(vm_zone_t z)
{
	globaldata_t gd = mycpu;
	void *item;

#ifdef INVARIANTS
	if (z == NULL)
		zerror(ZONE_ERROR_INVALID);
#endif
	/*
	 * Avoid spinlock contention by allocating from a per-cpu queue
	 */
	if (z->zfreecnt_pcpu[gd->gd_cpuid] > 0) {
		crit_enter_gd(gd);
		if (z->zfreecnt_pcpu[gd->gd_cpuid] > 0) {
			item = z->zitems_pcpu[gd->gd_cpuid];
#ifdef INVARIANTS
			KASSERT(item != NULL,
				("zitems_pcpu unexpectedly NULL"));
			if (((void **)item)[1] != (void *)ZENTRY_FREE)
				zerror(ZONE_ERROR_NOTFREE);
			((void **)item)[1] = 0;
#endif
			z->zitems_pcpu[gd->gd_cpuid] = ((void **)item)[0];
			--z->zfreecnt_pcpu[gd->gd_cpuid];
			z->znalloc++;
			crit_exit_gd(gd);
			return item;
		}
		crit_exit_gd(gd);
	}

	/*
	 * Per-zone spinlock for the remainder.
	 */
	spin_lock(&z->zlock);
	if (z->zfreecnt > z->zfreemin) {
		item = z->zitems;
#ifdef INVARIANTS
		KASSERT(item != NULL, ("zitems unexpectedly NULL"));
		if (((void **)item)[1] != (void *)ZENTRY_FREE)
			zerror(ZONE_ERROR_NOTFREE);
		((void **)item)[1] = 0;
#endif
		z->zitems = ((void **)item)[0];
		z->zfreecnt--;
		z->znalloc++;
		spin_unlock(&z->zlock);
	} else {
		spin_unlock(&z->zlock);
		item = zget(z);
		/*
		 * PANICFAIL allows the caller to assume that the zalloc()
		 * will always succeed.  If it doesn't, we panic here.
		 */
		if (item == NULL && (z->zflags & ZONE_PANICFAIL))
			panic("zalloc(%s) failed", z->zname);
	}
	return item;
}
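/*
 * Illustrative sketch only (hypothetical struct name, not compiled into
 * this file): a free item's first two pointer-sized words are overlaid
 * by the allocator as a freelist header,
 *
 *	struct free_item_hdr {
 *		void	*next;		-- ((void **)item)[0], freelist link
 *		void	*magic;		-- ((void **)item)[1], ZENTRY_FREE
 *	};
 *
 * which is why data stored in the first two longwords of an item is not
 * preserved across zfree()/zalloc() cycles.
 */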
/*
 * Free an item to the specified zone.
 */
void
zfree(vm_zone_t z, void *item)
{
	globaldata_t gd = mycpu;
	int zmax;

	/*
	 * Avoid spinlock contention by freeing into a per-cpu queue
	 */
	if ((zmax = z->zmax) != 0)
		zmax = zmax / ncpus / 16;
	if (zmax < 64)
		zmax = 64;

	if (z->zfreecnt_pcpu[gd->gd_cpuid] < zmax) {
		crit_enter_gd(gd);
		((void **)item)[0] = z->zitems_pcpu[gd->gd_cpuid];
#ifdef INVARIANTS
		if (((void **)item)[1] == (void *)ZENTRY_FREE)
			zerror(ZONE_ERROR_ALREADYFREE);
		((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
		z->zitems_pcpu[gd->gd_cpuid] = item;
		++z->zfreecnt_pcpu[gd->gd_cpuid];
		crit_exit_gd(gd);
		return;
	}

	/*
	 * Per-zone spinlock for the remainder.
	 */
	spin_lock(&z->zlock);
	((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
	if (((void **)item)[1] == (void *)ZENTRY_FREE)
		zerror(ZONE_ERROR_ALREADYFREE);
	((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
	z->zitems = item;
	z->zfreecnt++;
	spin_unlock(&z->zlock);
}
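/*
 * Worked example for the per-cpu cap above (hypothetical numbers): with
 * z->zmax = 65536 on a 16-cpu system, each per-cpu free queue holds at
 * most 65536 / 16 / 16 = 256 items.  Results below 64 (including
 * unlimited zones, where zmax == 0) are clamped up to 64 so small zones
 * still get a useful per-cpu cache.
 */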
/*
 * This file implements a very simple zone allocator.  It is used
 * in lieu of the malloc allocator where it is needed or more optimal.
 *
 * Note that the initial implementation of this had coloring, and
 * absolutely no improvement (actually a performance degradation)
 * occurred.
 *
 * Note also that the zones are type stable.  The only restriction is
 * that the first two longwords of a data structure may be changed
 * between allocations.  Any data that must be stable between allocations
 * must reside in areas after the first two longwords.
 *
 * zinitna, zinit and zbootinit are the initialization routines.
 * zalloc and zfree are the allocation/free routines.
 */
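/*
 * Typical usage sketch (illustrative only; 'struct foo' and 'foozone'
 * are hypothetical names, not part of this file):
 *
 *	static vm_zone_t foozone;
 *
 *	foozone = zinit("FOOZONE", sizeof(struct foo), 0, 0, 1);
 *
 *	struct foo *fp = zalloc(foozone);
 *	...	(use fp; keep nothing in its first two longwords that
 *		 must survive a free/alloc cycle)
 *	zfree(foozone, fp);
 */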
LIST_HEAD(zlist, vm_zone) zlist = LIST_HEAD_INITIALIZER(zlist);
static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS);
static int zone_kmem_pages, zone_kern_pages, zone_kmem_kvaspace;
/*
 * Create a zone, but don't allocate the zone structure.  If the
 * zone had been previously created by the zone boot code, initialize
 * various parts of the zone code.
 *
 * If waits are not allowed during allocation (e.g. during interrupt
 * code), allocate the kernel virtual space up front, and then allocate
 * only pages when needed.
 *
 * Arguments:
 * z		pointer to zone structure.
 * obj		pointer to VM object (opt).
 * name		name of zone.
 * size		size of zone entries.
 * nentries	number of zone entries allocated (only ZONE_INTERRUPT.)
 * flags	ZONE_INTERRUPT -- items can be allocated at interrupt time.
 * zalloc	number of pages allocated when memory is needed.
 *
 * Note that when using ZONE_INTERRUPT, the size of the zone is limited
 * by the nentries argument.  The size of the memory allocatable is
 * unlimited if ZONE_INTERRUPT is not set.  A sizing sketch follows this
 * comment.
 */
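/*
 * Sizing sketch for ZONE_INTERRUPT (hypothetical numbers): with
 * size = 192 and nentries = 1000, zsize stays 192 (already a multiple
 * of ZONE_ROUNDING == 32), the reserved KVA is
 * round_page(192 * 1000) = 192512 bytes on a 4 KB page system, and
 * zpagemax = 192512 / 4096 = 47 pages.
 */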
int
zinitna(vm_zone_t z, vm_object_t obj, char *name, int size,
	int nentries, int flags, int zalloc)
{
	size_t totsize;

	/*
	 * Only zones created with zinit() are destroyable.
	 */
	if (z->zflags & ZONE_DESTROYABLE)
		panic("zinitna: can't create destroyable zone");

	/*
	 * NOTE: We can only adjust zsize if we previously did not
	 *	 use zbootinit().
	 */
	if ((z->zflags & ZONE_BOOT) == 0) {
		z->zsize = (size + ZONE_ROUNDING - 1) & ~(ZONE_ROUNDING - 1);
		spin_init(&z->zlock);
		z->zfreecnt = 0;
		z->ztotal = 0;
		z->zmax = 0;
		z->zname = name;
		z->znalloc = 0;
		z->zitems = NULL;

		lwkt_gettoken(&vm_token);
		LIST_INSERT_HEAD(&zlist, z, zlink);
		lwkt_reltoken(&vm_token);

		bzero(z->zitems_pcpu, sizeof(z->zitems_pcpu));
		bzero(z->zfreecnt_pcpu, sizeof(z->zfreecnt_pcpu));
	}

	z->zkmvec = NULL;
	z->zkmcur = z->zkmmax = 0;
	z->zflags |= flags;

	/*
	 * If we cannot wait, allocate KVA space up front, and we will fill
	 * in pages as needed.  This is particularly required when creating
	 * an allocation space for map entries in kernel_map, because we
	 * do not want to go into a recursion deadlock with
	 * vm_map_entry_reserve().
	 */
	if (z->zflags & ZONE_INTERRUPT) {
		totsize = round_page(z->zsize * nentries);
		zone_kmem_kvaspace += totsize;

		z->zkva = kmem_alloc_pageable(&kernel_map, totsize);
		if (z->zkva == 0) {
			LIST_REMOVE(z, zlink);
			return 0;
		}

		z->zpagemax = totsize / PAGE_SIZE;
		if (obj == NULL) {
			z->zobj = vm_object_allocate(OBJT_DEFAULT, z->zpagemax);
		} else {
			z->zobj = obj;
			_vm_object_allocate(OBJT_DEFAULT, z->zpagemax, obj);
		}
		z->zallocflag = VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT;
		z->zmax += nentries;
	} else {
		z->zallocflag = VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM;
		z->zmax = 0;
	}

	if (z->zsize > PAGE_SIZE)
		z->zfreemin = 1;
	else
		z->zfreemin = PAGE_SIZE / z->zsize;

	z->zpagecount = 0;
	if (zalloc)
		z->zalloc = zalloc;
	else
		z->zalloc = 1;

	/*
	 * Populate the interrupt zone at creation time rather than
	 * on first allocation, as this is a potentially long operation.
	 */
	if (z->zflags & ZONE_INTERRUPT) {
		void *buf;

		buf = zget(z);
		zfree(z, buf);
	}

	return 1;
}
/*
 * Same as zinitna(), except the zone data structure is allocated
 * automatically by malloc.  This routine should normally be used, except
 * in certain tricky startup conditions in the VM system -- then
 * zbootinit and zinitna can be used.  zinit() is the standard zone
 * initialization call.
 */
vm_zone_t
zinit(char *name, int size, int nentries, int flags, int zalloc)
{
	vm_zone_t z;

	z = (vm_zone_t)kmalloc(sizeof(struct vm_zone), M_ZONE, M_NOWAIT);
	if (z == NULL)
		return NULL;

	z->zflags = 0;
	if (zinitna(z, NULL, name, size, nentries,
		    flags & ~ZONE_DESTROYABLE, zalloc) == 0) {
		kfree(z, M_ZONE);
		return NULL;
	}

	if (flags & ZONE_DESTROYABLE)
		z->zflags |= ZONE_DESTROYABLE;

	return z;
}
/*
 * Initialize a zone before the system is fully up.  This routine should
 * only be called before full VM startup.
 *
 * Called from the low level boot code only.  A usage sketch follows
 * this comment.
 */
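/*
 * Early-boot usage sketch (hypothetical names, shown for illustration):
 * the caller must supply static storage for both the zone header and
 * the initial items, since the kernel allocators are not up yet.
 *
 *	static struct vm_zone mapentzone_store;
 *	static struct vm_map_entry map_entry_init[MAX_MAPENT];
 *
 *	zbootinit(&mapentzone_store, "MAP ENTRY",
 *		  sizeof(struct vm_map_entry),
 *		  map_entry_init, MAX_MAPENT);
 */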
void
zbootinit(vm_zone_t z, char *name, int size, void *item, int nitems)
{
	int i;

	bzero(z->zitems_pcpu, sizeof(z->zitems_pcpu));
	bzero(z->zfreecnt_pcpu, sizeof(z->zfreecnt_pcpu));

	z->zname = name;
	z->zsize = size;
	z->zpagemax = 0;
	z->zobj = NULL;
	z->zflags = ZONE_BOOT;
	z->zfreemin = 0;
	z->zallocflag = 0;
	z->zpagecount = 0;
	z->zalloc = 0;
	z->znalloc = 0;

	spin_init(&z->zlock);

	bzero(item, nitems * z->zsize);
	z->zitems = NULL;
	for (i = 0; i < nitems; i++) {
		((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
		((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
		z->zitems = item;
		item = (uint8_t *)item + z->zsize;
	}
	z->zfreecnt = nitems;
	z->zmax = nitems;
	z->ztotal = nitems;

	lwkt_gettoken(&vm_token);
	LIST_INSERT_HEAD(&zlist, z, zlink);
	lwkt_reltoken(&vm_token);
}
/*
 * Release all resources owned by a zone created with zinit().
 */
void
zdestroy(vm_zone_t z)
{
	int i;

	if (z == NULL)
		panic("zdestroy: null zone");
	if ((z->zflags & ZONE_DESTROYABLE) == 0)
		panic("zdestroy: undestroyable zone");

	lwkt_gettoken(&vm_token);
	LIST_REMOVE(z, zlink);
	lwkt_reltoken(&vm_token);

	/*
	 * Release virtual mappings, physical memory and update sysctl stats.
	 */
	if (z->zflags & ZONE_INTERRUPT) {
		/*
		 * Pages mapped via pmap_kenter() must be removed from the
		 * kernel_pmap before calling kmem_free() to avoid issues
		 * with kernel_pmap.pm_stats.resident_count.
		 */
		pmap_qremove(z->zkva, z->zpagemax);

		/*
		 * Free the mapping.
		 */
		kmem_free(&kernel_map, z->zkva, z->zpagemax * PAGE_SIZE);
		atomic_subtract_int(&zone_kmem_kvaspace,
				    z->zpagemax * PAGE_SIZE);

		/*
		 * Free the backing object and physical pages.
		 */
		vm_object_deallocate(z->zobj);
		atomic_subtract_int(&zone_kmem_pages, z->zpagecount);
	} else {
		for (i = 0; i < z->zkmcur; i++) {
			kmem_free(&kernel_map, z->zkmvec[i],
				  z->zalloc * PAGE_SIZE);
			atomic_subtract_int(&zone_kern_pages, z->zalloc);
		}
		if (z->zkmvec != NULL)
			kfree(z->zkmvec, M_ZONE);
	}

	spin_uninit(&z->zlock);
	kfree(z, M_ZONE);
}
/*
 * void *zalloc(vm_zone_t zone) --
 *	Returns an item from a specified zone.  May not be called from a
 *	FAST interrupt or IPI function.
 *
 * void zfree(vm_zone_t zone, void *item) --
 *	Frees an item back to a specified zone.  May not be called from a
 *	FAST interrupt or IPI function.
 */

/*
 * Internal zone routine.  Not to be called from external (non vm_zone)
 * code.
 */
static void *
zget(vm_zone_t z)
{
	vm_page_t m;
	int i;
	int nitems;
	int savezpc;
	size_t nbytes;
	void *item;

	if (z == NULL)
		panic("zget: null zone");

	if (z->zflags & ZONE_INTERRUPT) {
		/*
		 * Interrupt zones do not mess with the kernel_map, they
		 * simply populate an existing mapping.
		 */
		vm_object_hold(z->zobj);
		savezpc = z->zpagecount;
		nbytes = z->zpagecount * PAGE_SIZE;
		nbytes -= nbytes % z->zsize;
		item = (char *)z->zkva + nbytes;
		for (i = 0; ((i < z->zalloc) && (z->zpagecount < z->zpagemax));
		     i++) {
			vm_offset_t zkva;

			m = vm_page_alloc(z->zobj, z->zpagecount,
					  z->zallocflag);
			/* note: z might be modified due to blocking */
			if (m == NULL)
				break;

			/*
			 * Unbusy page so it can be freed in zdestroy().  Make
			 * sure it is not on any queue and so can not be
			 * recycled under our feet.
			 */
			KKASSERT(m->queue == PQ_NONE);
			vm_page_flag_clear(m, PG_BUSY);

			zkva = z->zkva + z->zpagecount * PAGE_SIZE;
			pmap_kenter(zkva, VM_PAGE_TO_PHYS(m)); /* YYY */
			bzero((void *)zkva, PAGE_SIZE);
			KKASSERT(savezpc == z->zpagecount);
			savezpc++;
			z->zpagecount++;
			zone_kmem_pages++;
			vmstats.v_wire_count++;
		}
		nitems = ((z->zpagecount * PAGE_SIZE) - nbytes) / z->zsize;
		vm_object_drop(z->zobj);
	} else if (z->zflags & ZONE_SPECIAL) {
		/*
		 * The special zone is the one used for vm_map_entry_t's.
		 * We have to avoid an infinite recursion in
		 * vm_map_entry_reserve() by using vm_map_entry_kreserve()
		 * instead.  The map entries are pre-reserved by the kernel
		 * by vm_map_entry_reserve_cpu_init().
		 */
		nbytes = z->zalloc * PAGE_SIZE;

		item = (void *)kmem_alloc3(&kernel_map, nbytes, KM_KRESERVE);

		/* note: z might be modified due to blocking */
		if (item != NULL) {
			zone_kern_pages += z->zalloc;	/* not MP-safe XXX */
			bzero(item, nbytes);
		} else {
			nbytes = 0;
		}
		nitems = nbytes / z->zsize;
	} else {
		/*
		 * Otherwise allocate KVA from the kernel_map.
		 */
		nbytes = z->zalloc * PAGE_SIZE;

		item = (void *)kmem_alloc3(&kernel_map, nbytes, 0);

		/* note: z might be modified due to blocking */
		if (item != NULL) {
			zone_kern_pages += z->zalloc;	/* not MP-safe XXX */
			bzero(item, nbytes);

			if (z->zflags & ZONE_DESTROYABLE) {
				if (z->zkmcur == z->zkmmax) {
					z->zkmmax =
					    z->zkmmax == 0 ? 1 : z->zkmmax * 2;
					z->zkmvec = krealloc(z->zkmvec,
					    z->zkmmax * sizeof(z->zkmvec[0]),
					    M_ZONE, M_WAITOK);
				}
				z->zkmvec[z->zkmcur++] = (vm_offset_t)item;
			}
		} else {
			nbytes = 0;
		}
		nitems = nbytes / z->zsize;
	}

	spin_lock(&z->zlock);
	z->ztotal += nitems;

	/*
	 * Save one for immediate allocation
	 */
	if (nitems != 0) {
		nitems -= 1;
		for (i = 0; i < nitems; i++) {
			((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
			((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
			z->zitems = item;
			item = (uint8_t *)item + z->zsize;
		}
		z->zfreecnt += nitems;
		z->znalloc++;
	} else if (z->zfreecnt > 0) {
		item = z->zitems;
		z->zitems = ((void **)item)[0];
#ifdef INVARIANTS
		if (((void **)item)[1] != (void *)ZENTRY_FREE)
			zerror(ZONE_ERROR_NOTFREE);
		((void **)item)[1] = 0;
#endif
		z->zfreecnt--;
		z->znalloc++;
	} else {
		item = NULL;
	}
	spin_unlock(&z->zlock);

	/*
	 * A special zone may have used a kernel-reserved vm_map_entry.  If
	 * so we have to be sure to recover our reserve so we don't run out.
	 * We will panic if we run out.
	 */
	if (z->zflags & ZONE_SPECIAL)
		vm_map_entry_reserve(0);

	return item;
}
static int
sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
{
	vm_zone_t curzone;
	char tmpbuf[128];
	char tmpname[14];
	int error = 0;

	ksnprintf(tmpbuf, sizeof(tmpbuf),
	    "\nITEM            SIZE     LIMIT    USED    FREE  REQUESTS\n");
	error = SYSCTL_OUT(req, tmpbuf, strlen(tmpbuf));
	if (error)
		return (error);

	lwkt_gettoken(&vm_token);
	LIST_FOREACH(curzone, &zlist, zlink) {
		size_t i;
		size_t len;
		int offset;

		len = strlen(curzone->zname);
		if (len >= (sizeof(tmpname) - 1))
			len = (sizeof(tmpname) - 1);
		for (i = 0; i < sizeof(tmpname) - 1; i++)
			tmpname[i] = ' ';
		tmpname[i] = 0;
		memcpy(tmpname, curzone->zname, len);
		tmpname[len] = ':';

		offset = 0;
		if (curzone == LIST_FIRST(&zlist)) {
			offset = 1;
			tmpbuf[0] = '\n';
		}

		ksnprintf(tmpbuf + offset, sizeof(tmpbuf) - offset,
			"%s %6.6u, %8.8u, %6.6u, %6.6u, %8.8u\n",
			tmpname, curzone->zsize, curzone->zmax,
			(curzone->ztotal - curzone->zfreecnt),
			curzone->zfreecnt, curzone->znalloc);

		len = strlen((char *)tmpbuf);
		if (LIST_NEXT(curzone, zlink) == NULL)
			tmpbuf[len - 1] = 0;

		error = SYSCTL_OUT(req, tmpbuf, len);
		if (error)
			break;
	}
	lwkt_reltoken(&vm_token);
	return (error);
}
#if defined(INVARIANTS)

/*
 * Debugging only.
 */
static void
zerror(int error)
{
	const char *msg;

	switch (error) {
	case ZONE_ERROR_INVALID:
		msg = "zone: invalid zone";
		break;
	case ZONE_ERROR_NOTFREE:
		msg = "zone: entry not free";
		break;
	case ZONE_ERROR_ALREADYFREE:
		msg = "zone: freeing free entry";
		break;
	default:
		msg = "zone: invalid error";
		break;
	}
	panic("%s", msg);
}

#endif
SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING | CTLFLAG_RD,
	   NULL, 0, sysctl_vm_zone, "A", "Zone Info");

SYSCTL_INT(_vm, OID_AUTO, zone_kmem_pages, CTLFLAG_RD, &zone_kmem_pages, 0,
	   "Number of interrupt-safe pages allocated by zone");
SYSCTL_INT(_vm, OID_AUTO, zone_kmem_kvaspace, CTLFLAG_RD,
	   &zone_kmem_kvaspace, 0, "KVA space allocated by zone");
SYSCTL_INT(_vm, OID_AUTO, zone_kern_pages, CTLFLAG_RD, &zone_kern_pages, 0,
	   "Number of non-interrupt-safe pages allocated by zone");
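/*
 * Sample of what the vm.zone sysctl renders (illustrative values only;
 * the field layout follows the ksnprintf format in sysctl_vm_zone()):
 *
 *	ITEM            SIZE     LIMIT    USED    FREE  REQUESTS
 *	MAP ENTRY:    000064, 00000000, 000037, 000091, 00000128
 *	PV ENTRY:     000032, 00262144, 012345, 001234, 00123456
 */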