Rename:
[dragonfly.git] / sys / vm / vm_page.c
CommitLineData
984263bc
MD
1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91
37 * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
cde87949 38 * $DragonFly: src/sys/vm/vm_page.c,v 1.11 2003/10/02 21:00:20 hmp Exp $
984263bc
MD
39 */
40
41/*
42 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
43 * All rights reserved.
44 *
45 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
46 *
47 * Permission to use, copy, modify and distribute this software and
48 * its documentation is hereby granted, provided that both the copyright
49 * notice and this permission notice appear in all copies of the
50 * software, derivative works or modified versions, and any portions
51 * thereof, and that both notices appear in supporting documentation.
52 *
53 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
54 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
55 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
56 *
57 * Carnegie Mellon requests users of this software to return to
58 *
59 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
60 * School of Computer Science
61 * Carnegie Mellon University
62 * Pittsburgh PA 15213-3890
63 *
64 * any improvements or extensions that they make and grant Carnegie the
65 * rights to redistribute these changes.
66 */
67
68/*
69 * Resident memory management module.
70 */
71
72#include <sys/param.h>
73#include <sys/systm.h>
74#include <sys/malloc.h>
75#include <sys/proc.h>
76#include <sys/vmmeter.h>
77#include <sys/vnode.h>
78
79#include <vm/vm.h>
80#include <vm/vm_param.h>
81#include <sys/lock.h>
82#include <vm/vm_kern.h>
83#include <vm/pmap.h>
84#include <vm/vm_map.h>
85#include <vm/vm_object.h>
86#include <vm/vm_page.h>
87#include <vm/vm_pageout.h>
88#include <vm/vm_pager.h>
89#include <vm/vm_extern.h>
12e4aaff 90#include <vm/vm_page2.h>
984263bc
MD
91
92static void vm_page_queue_init (void);
93static vm_page_t vm_page_select_cache (vm_object_t, vm_pindex_t);
94
95/*
96 * Associated with page of user-allocatable memory is a
97 * page structure.
98 */
99
100static struct vm_page **vm_page_buckets; /* Array of buckets */
101static int vm_page_bucket_count; /* How big is array? */
102static int vm_page_hash_mask; /* Mask for hash function */
103static volatile int vm_page_bucket_generation;
104
105struct vpgqueues vm_page_queues[PQ_COUNT];
106
107static void
108vm_page_queue_init(void) {
109 int i;
110
111 for(i=0;i<PQ_L2_SIZE;i++) {
12e4aaff 112 vm_page_queues[PQ_FREE+i].cnt = &vmstats.v_free_count;
984263bc 113 }
12e4aaff 114 vm_page_queues[PQ_INACTIVE].cnt = &vmstats.v_inactive_count;
984263bc 115
12e4aaff
MD
116 vm_page_queues[PQ_ACTIVE].cnt = &vmstats.v_active_count;
117 vm_page_queues[PQ_HOLD].cnt = &vmstats.v_active_count;
984263bc 118 for(i=0;i<PQ_L2_SIZE;i++) {
12e4aaff 119 vm_page_queues[PQ_CACHE+i].cnt = &vmstats.v_cache_count;
984263bc
MD
120 }
121 for(i=0;i<PQ_COUNT;i++) {
122 TAILQ_INIT(&vm_page_queues[i].pl);
123 }
124}
125
126vm_page_t vm_page_array = 0;
127int vm_page_array_size = 0;
128long first_page = 0;
129int vm_page_zero_count = 0;
130
131static __inline int vm_page_hash (vm_object_t object, vm_pindex_t pindex);
132static void vm_page_free_wakeup (void);
133
134/*
135 * vm_set_page_size:
136 *
137 * Sets the page size, perhaps based upon the memory
138 * size. Must be called before any use of page-size
139 * dependent functions.
140 */
141void
142vm_set_page_size(void)
143{
12e4aaff
MD
144 if (vmstats.v_page_size == 0)
145 vmstats.v_page_size = PAGE_SIZE;
146 if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
984263bc
MD
147 panic("vm_set_page_size: page size not a power of two");
148}
149
150/*
151 * vm_add_new_page:
152 *
161399b3
MD
153 * Add a new page to the freelist for use by the system. New pages
154 * are added to both the head and tail of the associated free page
155 * queue in a bottom-up fashion, so both zero'd and non-zero'd page
156 * requests pull 'recent' adds (higher physical addresses) first.
157 *
984263bc
MD
158 * Must be called at splhigh().
159 */
160vm_page_t
161vm_add_new_page(vm_offset_t pa)
162{
163 vm_page_t m;
161399b3 164 struct vpgqueues *vpq;
984263bc 165
12e4aaff
MD
166 ++vmstats.v_page_count;
167 ++vmstats.v_free_count;
984263bc
MD
168 m = PHYS_TO_VM_PAGE(pa);
169 m->phys_addr = pa;
170 m->flags = 0;
171 m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK;
172 m->queue = m->pc + PQ_FREE;
161399b3
MD
173 vpq = &vm_page_queues[m->queue];
174 if (vpq->flipflop)
175 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
176 else
177 TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
178 vpq->flipflop = 1 - vpq->flipflop;
984263bc
MD
179 vm_page_queues[m->queue].lcnt++;
180 return (m);
181}
182
183/*
184 * vm_page_startup:
185 *
186 * Initializes the resident memory module.
187 *
188 * Allocates memory for the page cells, and
189 * for the object/offset-to-page hash table headers.
190 * Each page cell is initialized and placed on the free list.
191 */
192
193vm_offset_t
194vm_page_startup(vm_offset_t starta, vm_offset_t enda, vm_offset_t vaddr)
195{
196 vm_offset_t mapped;
197 struct vm_page **bucket;
198 vm_size_t npages, page_range;
199 vm_offset_t new_end;
200 int i;
201 vm_offset_t pa;
202 int nblocks;
203 vm_offset_t last_pa;
204
205 /* the biggest memory array is the second group of pages */
206 vm_offset_t end;
207 vm_offset_t biggestone, biggestsize;
208
209 vm_offset_t total;
210
211 total = 0;
212 biggestsize = 0;
213 biggestone = 0;
214 nblocks = 0;
215 vaddr = round_page(vaddr);
216
217 for (i = 0; phys_avail[i + 1]; i += 2) {
218 phys_avail[i] = round_page(phys_avail[i]);
219 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
220 }
221
222 for (i = 0; phys_avail[i + 1]; i += 2) {
223 int size = phys_avail[i + 1] - phys_avail[i];
224
225 if (size > biggestsize) {
226 biggestone = i;
227 biggestsize = size;
228 }
229 ++nblocks;
230 total += size;
231 }
232
233 end = phys_avail[biggestone+1];
234
235 /*
236 * Initialize the queue headers for the free queue, the active queue
237 * and the inactive queue.
238 */
239
240 vm_page_queue_init();
241
242 /*
243 * Allocate (and initialize) the hash table buckets.
244 *
245 * The number of buckets MUST BE a power of 2, and the actual value is
246 * the next power of 2 greater than the number of physical pages in
247 * the system.
248 *
249 * We make the hash table approximately 2x the number of pages to
250 * reduce the chain length. This is about the same size using the
251 * singly-linked list as the 1x hash table we were using before
252 * using TAILQ but the chain length will be smaller.
253 *
254 * Note: This computation can be tweaked if desired.
255 */
256 vm_page_buckets = (struct vm_page **)vaddr;
257 bucket = vm_page_buckets;
258 if (vm_page_bucket_count == 0) {
259 vm_page_bucket_count = 1;
260 while (vm_page_bucket_count < atop(total))
261 vm_page_bucket_count <<= 1;
262 }
263 vm_page_bucket_count <<= 1;
264 vm_page_hash_mask = vm_page_bucket_count - 1;
265
266 /*
267 * Validate these addresses.
268 */
269 new_end = end - vm_page_bucket_count * sizeof(struct vm_page *);
270 new_end = trunc_page(new_end);
271 mapped = round_page(vaddr);
272 vaddr = pmap_map(mapped, new_end, end,
273 VM_PROT_READ | VM_PROT_WRITE);
274 vaddr = round_page(vaddr);
275 bzero((caddr_t) mapped, vaddr - mapped);
276
277 for (i = 0; i < vm_page_bucket_count; i++) {
278 *bucket = NULL;
279 bucket++;
280 }
281
282 /*
283 * Compute the number of pages of memory that will be available for
284 * use (taking into account the overhead of a page structure per
285 * page).
286 */
287
288 first_page = phys_avail[0] / PAGE_SIZE;
289
290 page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page;
291 npages = (total - (page_range * sizeof(struct vm_page)) -
292 (end - new_end)) / PAGE_SIZE;
293
294 end = new_end;
295 /*
296 * Initialize the mem entry structures now, and put them in the free
297 * queue.
298 */
299 vm_page_array = (vm_page_t) vaddr;
300 mapped = vaddr;
301
302 /*
303 * Validate these addresses.
304 */
305
306 new_end = trunc_page(end - page_range * sizeof(struct vm_page));
307 mapped = pmap_map(mapped, new_end, end,
308 VM_PROT_READ | VM_PROT_WRITE);
309
310 /*
311 * Clear all of the page structures
312 */
313 bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
314 vm_page_array_size = page_range;
315
316 /*
161399b3 317 * Construct the free queue(s) in ascending order (by physical
984263bc
MD
318 * address) so that the first 16MB of physical memory is allocated
319 * last rather than first. On large-memory machines, this avoids
320 * the exhaustion of low physical memory before isa_dmainit has run.
321 */
12e4aaff
MD
322 vmstats.v_page_count = 0;
323 vmstats.v_free_count = 0;
984263bc
MD
324 for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) {
325 pa = phys_avail[i];
326 if (i == biggestone)
327 last_pa = new_end;
328 else
329 last_pa = phys_avail[i + 1];
330 while (pa < last_pa && npages-- > 0) {
331 vm_add_new_page(pa);
332 pa += PAGE_SIZE;
333 }
334 }
335 return (mapped);
336}
337
338/*
339 * vm_page_hash:
340 *
341 * Distributes the object/offset key pair among hash buckets.
342 *
343 * NOTE: This macro depends on vm_page_bucket_count being a power of 2.
344 * This routine may not block.
345 *
346 * We try to randomize the hash based on the object to spread the pages
347 * out in the hash table without it costing us too much.
348 */
349static __inline int
350vm_page_hash(vm_object_t object, vm_pindex_t pindex)
351{
352 int i = ((uintptr_t)object + pindex) ^ object->hash_rand;
353
354 return(i & vm_page_hash_mask);
355}
356
357void
358vm_page_unhold(vm_page_t mem)
359{
360 --mem->hold_count;
361 KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
362 if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
363 vm_page_free_toq(mem);
364}
365
366/*
367 * vm_page_insert: [ internal use only ]
368 *
369 * Inserts the given mem entry into the object and object list.
370 *
371 * The pagetables are not updated but will presumably fault the page
372 * in if necessary, or if a kernel page the caller will at some point
373 * enter the page into the kernel's pmap. We are not allowed to block
374 * here so we *can't* do this anyway.
375 *
376 * The object and page must be locked, and must be splhigh.
377 * This routine may not block.
378 */
379
380void
381vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
382{
383 struct vm_page **bucket;
384
385 if (m->object != NULL)
386 panic("vm_page_insert: already inserted");
387
388 /*
389 * Record the object/offset pair in this page
390 */
391
392 m->object = object;
393 m->pindex = pindex;
394
395 /*
396 * Insert it into the object_object/offset hash table
397 */
398
399 bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
400 m->hnext = *bucket;
401 *bucket = m;
402 vm_page_bucket_generation++;
403
404 /*
405 * Now link into the object's list of backed pages.
406 */
407
408 TAILQ_INSERT_TAIL(&object->memq, m, listq);
409 object->generation++;
410
411 /*
412 * show that the object has one more resident page.
413 */
414
415 object->resident_page_count++;
416
417 /*
418 * Since we are inserting a new and possibly dirty page,
419 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
420 */
421 if (m->flags & PG_WRITEABLE)
422 vm_object_set_writeable_dirty(object);
423}
424
425/*
426 * vm_page_remove:
427 * NOTE: used by device pager as well -wfj
428 *
429 * Removes the given mem entry from the object/offset-page
430 * table and the object page list, but do not invalidate/terminate
431 * the backing store.
432 *
433 * The object and page must be locked, and at splhigh.
434 * The underlying pmap entry (if any) is NOT removed here.
435 * This routine may not block.
436 */
437
438void
439vm_page_remove(vm_page_t m)
440{
441 vm_object_t object;
442
443 if (m->object == NULL)
444 return;
445
446 if ((m->flags & PG_BUSY) == 0) {
447 panic("vm_page_remove: page not busy");
448 }
449
450 /*
451 * Basically destroy the page.
452 */
453
454 vm_page_wakeup(m);
455
456 object = m->object;
457
458 /*
459 * Remove from the object_object/offset hash table. The object
460 * must be on the hash queue, we will panic if it isn't
461 *
462 * Note: we must NULL-out m->hnext to prevent loops in detached
463 * buffers with vm_page_lookup().
464 */
465
466 {
467 struct vm_page **bucket;
468
469 bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)];
470 while (*bucket != m) {
471 if (*bucket == NULL)
472 panic("vm_page_remove(): page not found in hash");
473 bucket = &(*bucket)->hnext;
474 }
475 *bucket = m->hnext;
476 m->hnext = NULL;
477 vm_page_bucket_generation++;
478 }
479
480 /*
481 * Now remove from the object's list of backed pages.
482 */
483
484 TAILQ_REMOVE(&object->memq, m, listq);
485
486 /*
487 * And show that the object has one fewer resident page.
488 */
489
490 object->resident_page_count--;
491 object->generation++;
492
493 m->object = NULL;
494}
495
496/*
497 * vm_page_lookup:
498 *
499 * Returns the page associated with the object/offset
500 * pair specified; if none is found, NULL is returned.
501 *
502 * NOTE: the code below does not lock. It will operate properly if
503 * an interrupt makes a change, but the generation algorithm will not
504 * operate properly in an SMP environment where both cpu's are able to run
505 * kernel code simultaneously.
506 *
507 * The object must be locked. No side effects.
508 * This routine may not block.
509 * This is a critical path routine
510 */
511
512vm_page_t
513vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
514{
515 vm_page_t m;
516 struct vm_page **bucket;
517 int generation;
518
519 /*
520 * Search the hash table for this object/offset pair
521 */
522
523retry:
524 generation = vm_page_bucket_generation;
525 bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
526 for (m = *bucket; m != NULL; m = m->hnext) {
527 if ((m->object == object) && (m->pindex == pindex)) {
528 if (vm_page_bucket_generation != generation)
529 goto retry;
530 return (m);
531 }
532 }
533 if (vm_page_bucket_generation != generation)
534 goto retry;
535 return (NULL);
536}
537
538/*
539 * vm_page_rename:
540 *
541 * Move the given memory entry from its
542 * current object to the specified target object/offset.
543 *
544 * The object must be locked.
545 * This routine may not block.
546 *
547 * Note: this routine will raise itself to splvm(), the caller need not.
548 *
549 * Note: swap associated with the page must be invalidated by the move. We
550 * have to do this for several reasons: (1) we aren't freeing the
551 * page, (2) we are dirtying the page, (3) the VM system is probably
552 * moving the page from object A to B, and will then later move
553 * the backing store from A to B and we can't have a conflict.
554 *
555 * Note: we *always* dirty the page. It is necessary both for the
556 * fact that we moved it, and because we may be invalidating
557 * swap. If the page is on the cache, we have to deactivate it
558 * or vm_page_dirty() will panic. Dirty pages are not allowed
559 * on the cache.
560 */
561
562void
563vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
564{
565 int s;
566
567 s = splvm();
568 vm_page_remove(m);
569 vm_page_insert(m, new_object, new_pindex);
570 if (m->queue - m->pc == PQ_CACHE)
571 vm_page_deactivate(m);
572 vm_page_dirty(m);
573 splx(s);
574}
575
576/*
577 * vm_page_unqueue_nowakeup:
578 *
579 * vm_page_unqueue() without any wakeup
580 *
581 * This routine must be called at splhigh().
582 * This routine may not block.
583 */
584
585void
586vm_page_unqueue_nowakeup(vm_page_t m)
587{
588 int queue = m->queue;
589 struct vpgqueues *pq;
590 if (queue != PQ_NONE) {
591 pq = &vm_page_queues[queue];
592 m->queue = PQ_NONE;
593 TAILQ_REMOVE(&pq->pl, m, pageq);
594 (*pq->cnt)--;
595 pq->lcnt--;
596 }
597}
598
599/*
600 * vm_page_unqueue:
601 *
602 * Remove a page from its queue.
603 *
604 * This routine must be called at splhigh().
605 * This routine may not block.
606 */
607
608void
609vm_page_unqueue(vm_page_t m)
610{
611 int queue = m->queue;
612 struct vpgqueues *pq;
613 if (queue != PQ_NONE) {
614 m->queue = PQ_NONE;
615 pq = &vm_page_queues[queue];
616 TAILQ_REMOVE(&pq->pl, m, pageq);
617 (*pq->cnt)--;
618 pq->lcnt--;
619 if ((queue - m->pc) == PQ_CACHE) {
620 if (vm_paging_needed())
621 pagedaemon_wakeup();
622 }
623 }
624}
625
#if PQ_L2_SIZE > 1

/*
 * vm_page_list_find:
 *
 *	Find a page on the specified queue with color optimization.
 *	Returns the first page found, or NULL if every queue probed is
 *	empty.
 *
 *	The page coloring optimization attempts to locate a page
 *	that does not overload other nearby pages in the object in
 *	the cpu's L1 or L2 caches.  We need this optimization because
 *	cpu caches tend to be physical caches, while object spaces tend
 *	to be virtual.
 *
 *	This routine must be called at splvm().
 *	This routine may not block.
 *
 *	This routine may only be called from the vm_page_list_find() macro
 *	in vm_page.h
 */
vm_page_t
_vm_page_list_find(int basequeue, int index)
{
	struct vpgqueues *base;
	vm_page_t m;
	int dist;

	base = &vm_page_queues[basequeue];

	/*
	 * Probe the colors at distance 'dist' on either side of index,
	 * starting with the largest distance and working inward.
	 *
	 * Note that for the first loop, index+dist and index-dist wind up
	 * at the same place.  Even though this is not totally optimal,
	 * we've already blown it by missing the cache case so we do not
	 * care.
	 */
	for (dist = PQ_L2_SIZE / 2; dist > 0; --dist) {
		m = TAILQ_FIRST(&base[(index + dist) & PQ_L2_MASK].pl);
		if (m != NULL)
			return (m);

		m = TAILQ_FIRST(&base[(index - dist) & PQ_L2_MASK].pl);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

#endif
671
672/*
673 * vm_page_select_cache:
674 *
675 * Find a page on the cache queue with color optimization. As pages
676 * might be found, but not applicable, they are deactivated. This
677 * keeps us from using potentially busy cached pages.
678 *
679 * This routine must be called at splvm().
680 * This routine may not block.
681 */
682vm_page_t
683vm_page_select_cache(vm_object_t object, vm_pindex_t pindex)
684{
685 vm_page_t m;
686
687 while (TRUE) {
688 m = vm_page_list_find(
689 PQ_CACHE,
690 (pindex + object->pg_color) & PQ_L2_MASK,
691 FALSE
692 );
693 if (m && ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy ||
694 m->hold_count || m->wire_count)) {
695 vm_page_deactivate(m);
696 continue;
697 }
698 return m;
699 }
700}
701
702/*
703 * vm_page_select_free:
704 *
705 * Find a free or zero page, with specified preference. We attempt to
706 * inline the nominal case and fall back to _vm_page_select_free()
707 * otherwise.
708 *
709 * This routine must be called at splvm().
710 * This routine may not block.
711 */
712
713static __inline vm_page_t
714vm_page_select_free(vm_object_t object, vm_pindex_t pindex, boolean_t prefer_zero)
715{
716 vm_page_t m;
717
718 m = vm_page_list_find(
719 PQ_FREE,
720 (pindex + object->pg_color) & PQ_L2_MASK,
721 prefer_zero
722 );
723 return(m);
724}
725
726/*
727 * vm_page_alloc:
728 *
729 * Allocate and return a memory cell associated
730 * with this VM object/offset pair.
731 *
732 * page_req classes:
733 * VM_ALLOC_NORMAL normal process request
734 * VM_ALLOC_SYSTEM system *really* needs a page
735 * VM_ALLOC_INTERRUPT interrupt time request
736 * VM_ALLOC_ZERO zero page
737 *
738 * Object must be locked.
739 * This routine may not block.
740 *
741 * Additional special handling is required when called from an
742 * interrupt (VM_ALLOC_INTERRUPT). We are not allowed to mess with
743 * the page cache in this case.
744 */
745
746vm_page_t
747vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
748{
749 vm_page_t m = NULL;
750 int s;
751
752 KASSERT(!vm_page_lookup(object, pindex),
753 ("vm_page_alloc: page already allocated"));
754
755 /*
756 * The pager is allowed to eat deeper into the free page list.
757 */
758
bc6dffab 759 if ((curthread == pagethread) && (page_req != VM_ALLOC_INTERRUPT)) {
984263bc
MD
760 page_req = VM_ALLOC_SYSTEM;
761 };
762
763 s = splvm();
764
765loop:
12e4aaff 766 if (vmstats.v_free_count > vmstats.v_free_reserved) {
984263bc
MD
767 /*
768 * Allocate from the free queue if there are plenty of pages
769 * in it.
770 */
771 if (page_req == VM_ALLOC_ZERO)
772 m = vm_page_select_free(object, pindex, TRUE);
773 else
774 m = vm_page_select_free(object, pindex, FALSE);
775 } else if (
776 (page_req == VM_ALLOC_SYSTEM &&
12e4aaff
MD
777 vmstats.v_cache_count == 0 &&
778 vmstats.v_free_count > vmstats.v_interrupt_free_min) ||
779 (page_req == VM_ALLOC_INTERRUPT && vmstats.v_free_count > 0)
984263bc
MD
780 ) {
781 /*
782 * Interrupt or system, dig deeper into the free list.
783 */
784 m = vm_page_select_free(object, pindex, FALSE);
785 } else if (page_req != VM_ALLOC_INTERRUPT) {
786 /*
787 * Allocatable from cache (non-interrupt only). On success,
788 * we must free the page and try again, thus ensuring that
12e4aaff 789 * vmstats.v_*_free_min counters are replenished.
984263bc
MD
790 */
791 m = vm_page_select_cache(object, pindex);
792 if (m == NULL) {
793 splx(s);
794#if defined(DIAGNOSTIC)
12e4aaff
MD
795 if (vmstats.v_cache_count > 0)
796 printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", vmstats.v_cache_count);
984263bc
MD
797#endif
798 vm_pageout_deficit++;
799 pagedaemon_wakeup();
800 return (NULL);
801 }
802 KASSERT(m->dirty == 0, ("Found dirty cache page %p", m));
803 vm_page_busy(m);
804 vm_page_protect(m, VM_PROT_NONE);
805 vm_page_free(m);
806 goto loop;
807 } else {
808 /*
809 * Not allocatable from cache from interrupt, give up.
810 */
811 splx(s);
812 vm_pageout_deficit++;
813 pagedaemon_wakeup();
814 return (NULL);
815 }
816
817 /*
818 * At this point we had better have found a good page.
819 */
820
821 KASSERT(
822 m != NULL,
823 ("vm_page_alloc(): missing page on free queue\n")
824 );
825
826 /*
827 * Remove from free queue
828 */
829
830 vm_page_unqueue_nowakeup(m);
831
832 /*
833 * Initialize structure. Only the PG_ZERO flag is inherited.
834 */
835
836 if (m->flags & PG_ZERO) {
837 vm_page_zero_count--;
838 m->flags = PG_ZERO | PG_BUSY;
839 } else {
840 m->flags = PG_BUSY;
841 }
842 m->wire_count = 0;
843 m->hold_count = 0;
844 m->act_count = 0;
845 m->busy = 0;
846 m->valid = 0;
847 KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m));
848
849 /*
850 * vm_page_insert() is safe prior to the splx(). Note also that
851 * inserting a page here does not insert it into the pmap (which
852 * could cause us to block allocating memory). We cannot block
853 * anywhere.
854 */
855
856 vm_page_insert(m, object, pindex);
857
858 /*
859 * Don't wakeup too often - wakeup the pageout daemon when
860 * we would be nearly out of memory.
861 */
862 if (vm_paging_needed())
863 pagedaemon_wakeup();
864
865 splx(s);
866
867 return (m);
868}
869
870/*
871 * vm_wait: (also see VM_WAIT macro)
872 *
873 * Block until free pages are available for allocation
874 * - Called in various places before memory allocations.
875 */
876
877void
878vm_wait(void)
879{
880 int s;
881
882 s = splvm();
bc6dffab 883 if (curthread == pagethread) {
984263bc 884 vm_pageout_pages_needed = 1;
377d4740 885 tsleep(&vm_pageout_pages_needed, 0, "VMWait", 0);
984263bc
MD
886 } else {
887 if (!vm_pages_needed) {
888 vm_pages_needed = 1;
889 wakeup(&vm_pages_needed);
890 }
377d4740 891 tsleep(&vmstats.v_free_count, 0, "vmwait", 0);
984263bc
MD
892 }
893 splx(s);
894}
895
896/*
897 * vm_waitpfault: (also see VM_WAITPFAULT macro)
898 *
899 * Block until free pages are available for allocation
900 * - Called only in vm_fault so that processes page faulting
901 * can be easily tracked.
902 * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing
903 * processes will be able to grab memory first. Do not change
904 * this balance without careful testing first.
905 */
906
907void
908vm_waitpfault(void)
909{
910 int s;
911
912 s = splvm();
913 if (!vm_pages_needed) {
914 vm_pages_needed = 1;
915 wakeup(&vm_pages_needed);
916 }
377d4740 917 tsleep(&vmstats.v_free_count, 0, "pfault", 0);
984263bc
MD
918 splx(s);
919}
920
984263bc
MD
921/*
922 * vm_page_activate:
923 *
924 * Put the specified page on the active list (if appropriate).
925 * Ensure that act_count is at least ACT_INIT but do not otherwise
926 * mess with it.
927 *
928 * The page queues must be locked.
929 * This routine may not block.
930 */
931void
932vm_page_activate(vm_page_t m)
933{
934 int s;
935
936 s = splvm();
937 if (m->queue != PQ_ACTIVE) {
938 if ((m->queue - m->pc) == PQ_CACHE)
12e4aaff 939 mycpu->gd_cnt.v_reactivated++;
984263bc
MD
940
941 vm_page_unqueue(m);
942
943 if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
944 m->queue = PQ_ACTIVE;
945 vm_page_queues[PQ_ACTIVE].lcnt++;
946 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
947 if (m->act_count < ACT_INIT)
948 m->act_count = ACT_INIT;
12e4aaff 949 vmstats.v_active_count++;
984263bc
MD
950 }
951 } else {
952 if (m->act_count < ACT_INIT)
953 m->act_count = ACT_INIT;
954 }
955
956 splx(s);
957}
958
959/*
960 * vm_page_free_wakeup:
961 *
962 * Helper routine for vm_page_free_toq() and vm_page_cache(). This
963 * routine is called when a page has been added to the cache or free
964 * queues.
965 *
966 * This routine may not block.
967 * This routine must be called at splvm()
968 */
969static __inline void
970vm_page_free_wakeup(void)
971{
972 /*
973 * if pageout daemon needs pages, then tell it that there are
974 * some free.
975 */
976 if (vm_pageout_pages_needed &&
12e4aaff 977 vmstats.v_cache_count + vmstats.v_free_count >= vmstats.v_pageout_free_min) {
984263bc
MD
978 wakeup(&vm_pageout_pages_needed);
979 vm_pageout_pages_needed = 0;
980 }
981 /*
982 * wakeup processes that are waiting on memory if we hit a
983 * high water mark. And wakeup scheduler process if we have
984 * lots of memory. this process will swapin processes.
985 */
986 if (vm_pages_needed && !vm_page_count_min()) {
987 vm_pages_needed = 0;
12e4aaff 988 wakeup(&vmstats.v_free_count);
984263bc
MD
989 }
990}
991
992/*
993 * vm_page_free_toq:
994 *
995 * Returns the given page to the PQ_FREE list,
996 * disassociating it with any VM object.
997 *
998 * Object and page must be locked prior to entry.
999 * This routine may not block.
1000 */
1001
1002void
1003vm_page_free_toq(vm_page_t m)
1004{
1005 int s;
1006 struct vpgqueues *pq;
1007 vm_object_t object = m->object;
1008
1009 s = splvm();
1010
12e4aaff 1011 mycpu->gd_cnt.v_tfree++;
984263bc
MD
1012
1013 if (m->busy || ((m->queue - m->pc) == PQ_FREE)) {
1014 printf(
1015 "vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n",
1016 (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0,
1017 m->hold_count);
1018 if ((m->queue - m->pc) == PQ_FREE)
1019 panic("vm_page_free: freeing free page");
1020 else
1021 panic("vm_page_free: freeing busy page");
1022 }
1023
1024 /*
1025 * unqueue, then remove page. Note that we cannot destroy
1026 * the page here because we do not want to call the pager's
1027 * callback routine until after we've put the page on the
1028 * appropriate free queue.
1029 */
1030
1031 vm_page_unqueue_nowakeup(m);
1032 vm_page_remove(m);
1033
1034 /*
1035 * If fictitious remove object association and
1036 * return, otherwise delay object association removal.
1037 */
1038
1039 if ((m->flags & PG_FICTITIOUS) != 0) {
1040 splx(s);
1041 return;
1042 }
1043
1044 m->valid = 0;
1045 vm_page_undirty(m);
1046
1047 if (m->wire_count != 0) {
1048 if (m->wire_count > 1) {
1049 panic("vm_page_free: invalid wire count (%d), pindex: 0x%lx",
1050 m->wire_count, (long)m->pindex);
1051 }
1052 panic("vm_page_free: freeing wired page\n");
1053 }
1054
1055 /*
1056 * If we've exhausted the object's resident pages we want to free
1057 * it up.
1058 */
1059
1060 if (object &&
1061 (object->type == OBJT_VNODE) &&
1062 ((object->flags & OBJ_DEAD) == 0)
1063 ) {
1064 struct vnode *vp = (struct vnode *)object->handle;
1065
1066 if (vp && VSHOULDFREE(vp))
1067 vfree(vp);
1068 }
1069
1070 /*
1071 * Clear the UNMANAGED flag when freeing an unmanaged page.
1072 */
1073
1074 if (m->flags & PG_UNMANAGED) {
1075 m->flags &= ~PG_UNMANAGED;
1076 } else {
1077#ifdef __alpha__
1078 pmap_page_is_free(m);
1079#endif
1080 }
1081
1082 if (m->hold_count != 0) {
1083 m->flags &= ~PG_ZERO;
1084 m->queue = PQ_HOLD;
1085 } else
1086 m->queue = PQ_FREE + m->pc;
1087 pq = &vm_page_queues[m->queue];
1088 pq->lcnt++;
1089 ++(*pq->cnt);
1090
1091 /*
1092 * Put zero'd pages on the end ( where we look for zero'd pages
1093 * first ) and non-zerod pages at the head.
1094 */
1095
1096 if (m->flags & PG_ZERO) {
1097 TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
1098 ++vm_page_zero_count;
1099 } else {
1100 TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1101 }
1102
1103 vm_page_free_wakeup();
1104
1105 splx(s);
1106}
1107
1108/*
1109 * vm_page_unmanage:
1110 *
1111 * Prevent PV management from being done on the page. The page is
1112 * removed from the paging queues as if it were wired, and as a
1113 * consequence of no longer being managed the pageout daemon will not
1114 * touch it (since there is no way to locate the pte mappings for the
1115 * page). madvise() calls that mess with the pmap will also no longer
1116 * operate on the page.
1117 *
1118 * Beyond that the page is still reasonably 'normal'. Freeing the page
1119 * will clear the flag.
1120 *
1121 * This routine is used by OBJT_PHYS objects - objects using unswappable
1122 * physical memory as backing store rather then swap-backed memory and
1123 * will eventually be extended to support 4MB unmanaged physical
1124 * mappings.
1125 */
1126
1127void
1128vm_page_unmanage(vm_page_t m)
1129{
1130 int s;
1131
1132 s = splvm();
1133 if ((m->flags & PG_UNMANAGED) == 0) {
1134 if (m->wire_count == 0)
1135 vm_page_unqueue(m);
1136 }
1137 vm_page_flag_set(m, PG_UNMANAGED);
1138 splx(s);
1139}
1140
1141/*
1142 * vm_page_wire:
1143 *
1144 * Mark this page as wired down by yet
1145 * another map, removing it from paging queues
1146 * as necessary.
1147 *
1148 * The page queues must be locked.
1149 * This routine may not block.
1150 */
1151void
1152vm_page_wire(vm_page_t m)
1153{
1154 int s;
1155
1156 /*
1157 * Only bump the wire statistics if the page is not already wired,
1158 * and only unqueue the page if it is on some queue (if it is unmanaged
1159 * it is already off the queues).
1160 */
1161 s = splvm();
1162 if (m->wire_count == 0) {
1163 if ((m->flags & PG_UNMANAGED) == 0)
1164 vm_page_unqueue(m);
12e4aaff 1165 vmstats.v_wire_count++;
984263bc
MD
1166 }
1167 m->wire_count++;
1168 KASSERT(m->wire_count != 0,
1169 ("vm_page_wire: wire_count overflow m=%p", m));
1170
1171 splx(s);
1172 vm_page_flag_set(m, PG_MAPPED);
1173}
1174
1175/*
1176 * vm_page_unwire:
1177 *
1178 * Release one wiring of this page, potentially
1179 * enabling it to be paged again.
1180 *
1181 * Many pages placed on the inactive queue should actually go
1182 * into the cache, but it is difficult to figure out which. What
1183 * we do instead, if the inactive target is well met, is to put
1184 * clean pages at the head of the inactive queue instead of the tail.
1185 * This will cause them to be moved to the cache more quickly and
1186 * if not actively re-referenced, freed more quickly. If we just
1187 * stick these pages at the end of the inactive queue, heavy filesystem
1188 * meta-data accesses can cause an unnecessary paging load on memory bound
1189 * processes. This optimization causes one-time-use metadata to be
1190 * reused more quickly.
1191 *
1192 * BUT, if we are in a low-memory situation we have no choice but to
1193 * put clean pages on the cache queue.
1194 *
1195 * A number of routines use vm_page_unwire() to guarantee that the page
1196 * will go into either the inactive or active queues, and will NEVER
1197 * be placed in the cache - for example, just after dirtying a page.
1198 * dirty pages in the cache are not allowed.
1199 *
1200 * The page queues must be locked.
1201 * This routine may not block.
1202 */
1203void
1204vm_page_unwire(vm_page_t m, int activate)
1205{
1206 int s;
1207
1208 s = splvm();
1209
1210 if (m->wire_count > 0) {
1211 m->wire_count--;
1212 if (m->wire_count == 0) {
12e4aaff 1213 vmstats.v_wire_count--;
984263bc
MD
1214 if (m->flags & PG_UNMANAGED) {
1215 ;
1216 } else if (activate) {
1217 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1218 m->queue = PQ_ACTIVE;
1219 vm_page_queues[PQ_ACTIVE].lcnt++;
12e4aaff 1220 vmstats.v_active_count++;
984263bc
MD
1221 } else {
1222 vm_page_flag_clear(m, PG_WINATCFLS);
1223 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1224 m->queue = PQ_INACTIVE;
1225 vm_page_queues[PQ_INACTIVE].lcnt++;
12e4aaff 1226 vmstats.v_inactive_count++;
984263bc
MD
1227 }
1228 }
1229 } else {
1230 panic("vm_page_unwire: invalid wire count: %d\n", m->wire_count);
1231 }
1232 splx(s);
1233}
1234
1235
1236/*
1237 * Move the specified page to the inactive queue. If the page has
1238 * any associated swap, the swap is deallocated.
1239 *
1240 * Normally athead is 0 resulting in LRU operation. athead is set
1241 * to 1 if we want this page to be 'as if it were placed in the cache',
1242 * except without unmapping it from the process address space.
1243 *
1244 * This routine may not block.
1245 */
1246static __inline void
1247_vm_page_deactivate(vm_page_t m, int athead)
1248{
1249 int s;
1250
1251 /*
1252 * Ignore if already inactive.
1253 */
1254 if (m->queue == PQ_INACTIVE)
1255 return;
1256
1257 s = splvm();
1258 if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
1259 if ((m->queue - m->pc) == PQ_CACHE)
12e4aaff 1260 mycpu->gd_cnt.v_reactivated++;
984263bc
MD
1261 vm_page_flag_clear(m, PG_WINATCFLS);
1262 vm_page_unqueue(m);
1263 if (athead)
1264 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1265 else
1266 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1267 m->queue = PQ_INACTIVE;
1268 vm_page_queues[PQ_INACTIVE].lcnt++;
12e4aaff 1269 vmstats.v_inactive_count++;
984263bc
MD
1270 }
1271 splx(s);
1272}
1273
1274void
1275vm_page_deactivate(vm_page_t m)
1276{
1277 _vm_page_deactivate(m, 0);
1278}
1279
1280/*
1281 * vm_page_try_to_cache:
1282 *
1283 * Returns 0 on failure, 1 on success
1284 */
1285int
1286vm_page_try_to_cache(vm_page_t m)
1287{
1288 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
1289 (m->flags & (PG_BUSY|PG_UNMANAGED))) {
1290 return(0);
1291 }
1292 vm_page_test_dirty(m);
1293 if (m->dirty)
1294 return(0);
1295 vm_page_cache(m);
1296 return(1);
1297}
1298
1299/*
1300 * vm_page_try_to_free()
1301 *
1302 * Attempt to free the page. If we cannot free it, we do nothing.
1303 * 1 is returned on success, 0 on failure.
1304 */
1305
1306int
1307vm_page_try_to_free(vm_page_t m)
1308{
1309 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
1310 (m->flags & (PG_BUSY|PG_UNMANAGED))) {
1311 return(0);
1312 }
1313 vm_page_test_dirty(m);
1314 if (m->dirty)
1315 return(0);
1316 vm_page_busy(m);
1317 vm_page_protect(m, VM_PROT_NONE);
1318 vm_page_free(m);
1319 return(1);
1320}
1321
1322
1323/*
1324 * vm_page_cache
1325 *
1326 * Put the specified page onto the page cache queue (if appropriate).
1327 *
1328 * This routine may not block.
1329 */
1330void
1331vm_page_cache(vm_page_t m)
1332{
1333 int s;
1334
1335 if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy || m->wire_count) {
1336 printf("vm_page_cache: attempting to cache busy page\n");
1337 return;
1338 }
1339 if ((m->queue - m->pc) == PQ_CACHE)
1340 return;
1341
1342 /*
1343 * Remove all pmaps and indicate that the page is not
1344 * writeable or mapped.
1345 */
1346
1347 vm_page_protect(m, VM_PROT_NONE);
1348 if (m->dirty != 0) {
1349 panic("vm_page_cache: caching a dirty page, pindex: %ld",
1350 (long)m->pindex);
1351 }
1352 s = splvm();
1353 vm_page_unqueue_nowakeup(m);
1354 m->queue = PQ_CACHE + m->pc;
1355 vm_page_queues[m->queue].lcnt++;
1356 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
12e4aaff 1357 vmstats.v_cache_count++;
984263bc
MD
1358 vm_page_free_wakeup();
1359 splx(s);
1360}
1361
1362/*
1363 * vm_page_dontneed
1364 *
1365 * Cache, deactivate, or do nothing as appropriate. This routine
1366 * is typically used by madvise() MADV_DONTNEED.
1367 *
1368 * Generally speaking we want to move the page into the cache so
1369 * it gets reused quickly. However, this can result in a silly syndrome
1370 * due to the page recycling too quickly. Small objects will not be
1371 * fully cached. On the otherhand, if we move the page to the inactive
1372 * queue we wind up with a problem whereby very large objects
1373 * unnecessarily blow away our inactive and cache queues.
1374 *
1375 * The solution is to move the pages based on a fixed weighting. We
1376 * either leave them alone, deactivate them, or move them to the cache,
1377 * where moving them to the cache has the highest weighting.
1378 * By forcing some pages into other queues we eventually force the
1379 * system to balance the queues, potentially recovering other unrelated
1380 * space from active. The idea is to not force this to happen too
1381 * often.
1382 */
1383
1384void
1385vm_page_dontneed(vm_page_t m)
1386{
1387 static int dnweight;
1388 int dnw;
1389 int head;
1390
1391 dnw = ++dnweight;
1392
1393 /*
1394 * occassionally leave the page alone
1395 */
1396
1397 if ((dnw & 0x01F0) == 0 ||
1398 m->queue == PQ_INACTIVE ||
1399 m->queue - m->pc == PQ_CACHE
1400 ) {
1401 if (m->act_count >= ACT_INIT)
1402 --m->act_count;
1403 return;
1404 }
1405
1406 if (m->dirty == 0)
1407 vm_page_test_dirty(m);
1408
1409 if (m->dirty || (dnw & 0x0070) == 0) {
1410 /*
1411 * Deactivate the page 3 times out of 32.
1412 */
1413 head = 0;
1414 } else {
1415 /*
1416 * Cache the page 28 times out of every 32. Note that
1417 * the page is deactivated instead of cached, but placed
1418 * at the head of the queue instead of the tail.
1419 */
1420 head = 1;
1421 }
1422 _vm_page_deactivate(m, head);
1423}
1424
1425/*
1426 * Grab a page, waiting until we are waken up due to the page
1427 * changing state. We keep on waiting, if the page continues
1428 * to be in the object. If the page doesn't exist, allocate it.
1429 *
1430 * This routine may block.
1431 */
1432vm_page_t
1433vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
1434{
1435
1436 vm_page_t m;
1437 int s, generation;
1438
1439retrylookup:
1440 if ((m = vm_page_lookup(object, pindex)) != NULL) {
1441 if (m->busy || (m->flags & PG_BUSY)) {
1442 generation = object->generation;
1443
1444 s = splvm();
1445 while ((object->generation == generation) &&
1446 (m->busy || (m->flags & PG_BUSY))) {
1447 vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
377d4740 1448 tsleep(m, 0, "pgrbwt", 0);
984263bc
MD
1449 if ((allocflags & VM_ALLOC_RETRY) == 0) {
1450 splx(s);
1451 return NULL;
1452 }
1453 }
1454 splx(s);
1455 goto retrylookup;
1456 } else {
1457 vm_page_busy(m);
1458 return m;
1459 }
1460 }
1461
1462 m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY);
1463 if (m == NULL) {
1464 VM_WAIT;
1465 if ((allocflags & VM_ALLOC_RETRY) == 0)
1466 return NULL;
1467 goto retrylookup;
1468 }
1469
1470 return m;
1471}
1472
1473/*
1474 * Mapping function for valid bits or for dirty bits in
1475 * a page. May not block.
1476 *
1477 * Inputs are required to range within a page.
1478 */
1479
1480__inline int
1481vm_page_bits(int base, int size)
1482{
1483 int first_bit;
1484 int last_bit;
1485
1486 KASSERT(
1487 base + size <= PAGE_SIZE,
1488 ("vm_page_bits: illegal base/size %d/%d", base, size)
1489 );
1490
1491 if (size == 0) /* handle degenerate case */
1492 return(0);
1493
1494 first_bit = base >> DEV_BSHIFT;
1495 last_bit = (base + size - 1) >> DEV_BSHIFT;
1496
1497 return ((2 << last_bit) - (1 << first_bit));
1498}
1499
1500/*
1501 * vm_page_set_validclean:
1502 *
1503 * Sets portions of a page valid and clean. The arguments are expected
1504 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
1505 * of any partial chunks touched by the range. The invalid portion of
1506 * such chunks will be zero'd.
1507 *
1508 * This routine may not block.
1509 *
1510 * (base + size) must be less then or equal to PAGE_SIZE.
1511 */
1512void
1513vm_page_set_validclean(vm_page_t m, int base, int size)
1514{
1515 int pagebits;
1516 int frag;
1517 int endoff;
1518
1519 if (size == 0) /* handle degenerate case */
1520 return;
1521
1522 /*
1523 * If the base is not DEV_BSIZE aligned and the valid
1524 * bit is clear, we have to zero out a portion of the
1525 * first block.
1526 */
1527
1528 if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
1529 (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
1530 ) {
1531 pmap_zero_page_area(
1532 VM_PAGE_TO_PHYS(m),
1533 frag,
1534 base - frag
1535 );
1536 }
1537
1538 /*
1539 * If the ending offset is not DEV_BSIZE aligned and the
1540 * valid bit is clear, we have to zero out a portion of
1541 * the last block.
1542 */
1543
1544 endoff = base + size;
1545
1546 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
1547 (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
1548 ) {
1549 pmap_zero_page_area(
1550 VM_PAGE_TO_PHYS(m),
1551 endoff,
1552 DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
1553 );
1554 }
1555
1556 /*
1557 * Set valid, clear dirty bits. If validating the entire
1558 * page we can safely clear the pmap modify bit. We also
1559 * use this opportunity to clear the PG_NOSYNC flag. If a process
1560 * takes a write fault on a MAP_NOSYNC memory area the flag will
1561 * be set again.
1562 *
1563 * We set valid bits inclusive of any overlap, but we can only
1564 * clear dirty bits for DEV_BSIZE chunks that are fully within
1565 * the range.
1566 */
1567
1568 pagebits = vm_page_bits(base, size);
1569 m->valid |= pagebits;
1570#if 0 /* NOT YET */
1571 if ((frag = base & (DEV_BSIZE - 1)) != 0) {
1572 frag = DEV_BSIZE - frag;
1573 base += frag;
1574 size -= frag;
1575 if (size < 0)
1576 size = 0;
1577 }
1578 pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
1579#endif
1580 m->dirty &= ~pagebits;
1581 if (base == 0 && size == PAGE_SIZE) {
1582 pmap_clear_modify(m);
1583 vm_page_flag_clear(m, PG_NOSYNC);
1584 }
1585}
1586
#if 0

/*
 * vm_page_set_dirty:	(currently compiled out)
 *
 *	Mark dirty every DEV_BSIZE chunk touched by [base, base + size).
 */
void
vm_page_set_dirty(vm_page_t m, int base, int size)
{
	int bits;

	bits = vm_page_bits(base, size);
	m->dirty |= bits;
}

#endif
1596
1597void
1598vm_page_clear_dirty(vm_page_t m, int base, int size)
1599{
1600 m->dirty &= ~vm_page_bits(base, size);
1601}
1602
1603/*
1604 * vm_page_set_invalid:
1605 *
1606 * Invalidates DEV_BSIZE'd chunks within a page. Both the
1607 * valid and dirty bits for the effected areas are cleared.
1608 *
1609 * May not block.
1610 */
1611void
1612vm_page_set_invalid(vm_page_t m, int base, int size)
1613{
1614 int bits;
1615
1616 bits = vm_page_bits(base, size);
1617 m->valid &= ~bits;
1618 m->dirty &= ~bits;
1619 m->object->generation++;
1620}
1621
1622/*
1623 * vm_page_zero_invalid()
1624 *
1625 * The kernel assumes that the invalid portions of a page contain
1626 * garbage, but such pages can be mapped into memory by user code.
1627 * When this occurs, we must zero out the non-valid portions of the
1628 * page so user code sees what it expects.
1629 *
1630 * Pages are most often semi-valid when the end of a file is mapped
1631 * into memory and the file's size is not page aligned.
1632 */
1633
1634void
1635vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
1636{
1637 int b;
1638 int i;
1639
1640 /*
1641 * Scan the valid bits looking for invalid sections that
1642 * must be zerod. Invalid sub-DEV_BSIZE'd areas ( where the
1643 * valid bit may be set ) have already been zerod by
1644 * vm_page_set_validclean().
1645 */
1646
1647 for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
1648 if (i == (PAGE_SIZE / DEV_BSIZE) ||
1649 (m->valid & (1 << i))
1650 ) {
1651 if (i > b) {
1652 pmap_zero_page_area(
1653 VM_PAGE_TO_PHYS(m),
1654 b << DEV_BSHIFT,
1655 (i - b) << DEV_BSHIFT
1656 );
1657 }
1658 b = i + 1;
1659 }
1660 }
1661
1662 /*
1663 * setvalid is TRUE when we can safely set the zero'd areas
1664 * as being valid. We can do this if there are no cache consistency
1665 * issues. e.g. it is ok to do with UFS, but not ok to do with NFS.
1666 */
1667
1668 if (setvalid)
1669 m->valid = VM_PAGE_BITS_ALL;
1670}
1671
1672/*
1673 * vm_page_is_valid:
1674 *
1675 * Is (partial) page valid? Note that the case where size == 0
1676 * will return FALSE in the degenerate case where the page is
1677 * entirely invalid, and TRUE otherwise.
1678 *
1679 * May not block.
1680 */
1681
1682int
1683vm_page_is_valid(vm_page_t m, int base, int size)
1684{
1685 int bits = vm_page_bits(base, size);
1686
1687 if (m->valid && ((m->valid & bits) == bits))
1688 return 1;
1689 else
1690 return 0;
1691}
1692
1693/*
1694 * update dirty bits from pmap/mmu. May not block.
1695 */
1696
1697void
1698vm_page_test_dirty(vm_page_t m)
1699{
1700 if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
1701 vm_page_dirty(m);
1702 }
1703}
1704
1705/*
1706 * This interface is for merging with malloc() someday.
1707 * Even if we never implement compaction so that contiguous allocation
1708 * works after initialization time, malloc()'s data structures are good
1709 * for statistics and for allocations of less than a page.
1710 */
1711void *
1712contigmalloc1(
1713 unsigned long size, /* should be size_t here and for malloc() */
1714 struct malloc_type *type,
1715 int flags,
1716 unsigned long low,
1717 unsigned long high,
1718 unsigned long alignment,
1719 unsigned long boundary,
1720 vm_map_t map)
1721{
1722 int i, s, start;
1723 vm_offset_t addr, phys, tmp_addr;
1724 int pass;
1725 vm_page_t pga = vm_page_array;
a108bf71 1726 int count;
984263bc
MD
1727
1728 size = round_page(size);
1729 if (size == 0)
1730 panic("contigmalloc1: size must not be 0");
1731 if ((alignment & (alignment - 1)) != 0)
1732 panic("contigmalloc1: alignment must be a power of 2");
1733 if ((boundary & (boundary - 1)) != 0)
1734 panic("contigmalloc1: boundary must be a power of 2");
1735
1736 start = 0;
1737 for (pass = 0; pass <= 1; pass++) {
1738 s = splvm();
1739again:
1740 /*
1741 * Find first page in array that is free, within range, aligned, and
1742 * such that the boundary won't be crossed.
1743 */
12e4aaff 1744 for (i = start; i < vmstats.v_page_count; i++) {
984263bc
MD
1745 int pqtype;
1746 phys = VM_PAGE_TO_PHYS(&pga[i]);
1747 pqtype = pga[i].queue - pga[i].pc;
1748 if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) &&
1749 (phys >= low) && (phys < high) &&
1750 ((phys & (alignment - 1)) == 0) &&
1751 (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0))
1752 break;
1753 }
1754
1755 /*
1756 * If the above failed or we will exceed the upper bound, fail.
1757 */
12e4aaff 1758 if ((i == vmstats.v_page_count) ||
984263bc
MD
1759 ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) {
1760 vm_page_t m, next;
1761
1762again1:
1763 for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
1764 m != NULL;
1765 m = next) {
1766
1767 KASSERT(m->queue == PQ_INACTIVE,
1768 ("contigmalloc1: page %p is not PQ_INACTIVE", m));
1769
1770 next = TAILQ_NEXT(m, pageq);
1771 if (vm_page_sleep_busy(m, TRUE, "vpctw0"))
1772 goto again1;
1773 vm_page_test_dirty(m);
1774 if (m->dirty) {
1775 if (m->object->type == OBJT_VNODE) {
dadab5e9 1776 vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curthread);
984263bc 1777 vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC);
dadab5e9 1778 VOP_UNLOCK(m->object->handle, 0, curthread);
984263bc
MD
1779 goto again1;
1780 } else if (m->object->type == OBJT_SWAP ||
1781 m->object->type == OBJT_DEFAULT) {
1782 vm_pageout_flush(&m, 1, 0);
1783 goto again1;
1784 }
1785 }
1786 if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0))
1787 vm_page_cache(m);
1788 }
1789
1790 for (m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
1791 m != NULL;
1792 m = next) {
1793
1794 KASSERT(m->queue == PQ_ACTIVE,
1795 ("contigmalloc1: page %p is not PQ_ACTIVE", m));
1796
1797 next = TAILQ_NEXT(m, pageq);
1798 if (vm_page_sleep_busy(m, TRUE, "vpctw1"))
1799 goto again1;
1800 vm_page_test_dirty(m);
1801 if (m->dirty) {
1802 if (m->object->type == OBJT_VNODE) {
dadab5e9 1803 vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curthread);
984263bc 1804 vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC);
dadab5e9 1805 VOP_UNLOCK(m->object->handle, 0, curthread);
984263bc
MD
1806 goto again1;
1807 } else if (m->object->type == OBJT_SWAP ||
1808 m->object->type == OBJT_DEFAULT) {
1809 vm_pageout_flush(&m, 1, 0);
1810 goto again1;
1811 }
1812 }
1813 if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0))
1814 vm_page_cache(m);
1815 }
1816
1817 splx(s);
1818 continue;
1819 }
1820 start = i;
1821
1822 /*
1823 * Check successive pages for contiguous and free.
1824 */
1825 for (i = start + 1; i < (start + size / PAGE_SIZE); i++) {
1826 int pqtype;
1827 pqtype = pga[i].queue - pga[i].pc;
1828 if ((VM_PAGE_TO_PHYS(&pga[i]) !=
1829 (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) ||
1830 ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) {
1831 start++;
1832 goto again;
1833 }
1834 }
1835
1836 for (i = start; i < (start + size / PAGE_SIZE); i++) {
1837 int pqtype;
1838 vm_page_t m = &pga[i];
1839
1840 pqtype = m->queue - m->pc;
1841 if (pqtype == PQ_CACHE) {
1842 vm_page_busy(m);
1843 vm_page_free(m);
1844 }
1845 vm_page_unqueue_nowakeup(m);
1846 m->valid = VM_PAGE_BITS_ALL;
1847 if (m->flags & PG_ZERO)
1848 vm_page_zero_count--;
1849 m->flags = 0;
1850 KASSERT(m->dirty == 0, ("contigmalloc1: page %p was dirty", m));
1851 m->wire_count = 0;
1852 m->busy = 0;
1853 m->object = NULL;
1854 }
1855
1856 /*
1857 * We've found a contiguous chunk that meets are requirements.
1858 * Allocate kernel VM, unfree and assign the physical pages to it and
1859 * return kernel VM pointer.
1860 */
1861 vm_map_lock(map);
a108bf71 1862 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
e9bb90e8 1863 if (vm_map_findspace(map, vm_map_min(map), size, 1, &addr) !=
984263bc
MD
1864 KERN_SUCCESS) {
1865 /*
1866 * XXX We almost never run out of kernel virtual
1867 * space, so we don't make the allocated memory
1868 * above available.
1869 */
1870 vm_map_unlock(map);
a108bf71 1871 vm_map_entry_release(count);
984263bc
MD
1872 splx(s);
1873 return (NULL);
1874 }
1875 vm_object_reference(kernel_object);
a108bf71
MD
1876 vm_map_insert(map, &count,
1877 kernel_object, addr - VM_MIN_KERNEL_ADDRESS,
984263bc
MD
1878 addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0);
1879 vm_map_unlock(map);
a108bf71 1880 vm_map_entry_release(count);
984263bc
MD
1881
1882 tmp_addr = addr;
1883 for (i = start; i < (start + size / PAGE_SIZE); i++) {
1884 vm_page_t m = &pga[i];
1885 vm_page_insert(m, kernel_object,
1886 OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
1887 tmp_addr += PAGE_SIZE;
1888 }
cde87949 1889 vm_map_wire(map, addr, addr + size, FALSE);
984263bc
MD
1890
1891 splx(s);
1892 return ((void *)addr);
1893 }
1894 return NULL;
1895}
1896
1897void *
1898contigmalloc(
1899 unsigned long size, /* should be size_t here and for malloc() */
1900 struct malloc_type *type,
1901 int flags,
1902 unsigned long low,
1903 unsigned long high,
1904 unsigned long alignment,
1905 unsigned long boundary)
1906{
1907 return contigmalloc1(size, type, flags, low, high, alignment, boundary,
1908 kernel_map);
1909}
1910
1911void
1912contigfree(void *addr, unsigned long size, struct malloc_type *type)
1913{
1914 kmem_free(kernel_map, (vm_offset_t)addr, size);
1915}
1916
1917vm_offset_t
1918vm_page_alloc_contig(
1919 vm_offset_t size,
1920 vm_offset_t low,
1921 vm_offset_t high,
1922 vm_offset_t alignment)
1923{
1924 return ((vm_offset_t)contigmalloc1(size, M_DEVBUF, M_NOWAIT, low, high,
1925 alignment, 0ul, kernel_map));
1926}
1927
1928#include "opt_ddb.h"
1929#ifdef DDB
1930#include <sys/kernel.h>
1931
1932#include <ddb/ddb.h>
1933
1934DB_SHOW_COMMAND(page, vm_page_print_page_info)
1935{
12e4aaff
MD
1936 db_printf("vmstats.v_free_count: %d\n", vmstats.v_free_count);
1937 db_printf("vmstats.v_cache_count: %d\n", vmstats.v_cache_count);
1938 db_printf("vmstats.v_inactive_count: %d\n", vmstats.v_inactive_count);
1939 db_printf("vmstats.v_active_count: %d\n", vmstats.v_active_count);
1940 db_printf("vmstats.v_wire_count: %d\n", vmstats.v_wire_count);
1941 db_printf("vmstats.v_free_reserved: %d\n", vmstats.v_free_reserved);
1942 db_printf("vmstats.v_free_min: %d\n", vmstats.v_free_min);
1943 db_printf("vmstats.v_free_target: %d\n", vmstats.v_free_target);
1944 db_printf("vmstats.v_cache_min: %d\n", vmstats.v_cache_min);
1945 db_printf("vmstats.v_inactive_target: %d\n", vmstats.v_inactive_target);
984263bc
MD
1946}
1947
1948DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
1949{
1950 int i;
1951 db_printf("PQ_FREE:");
1952 for(i=0;i<PQ_L2_SIZE;i++) {
1953 db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
1954 }
1955 db_printf("\n");
1956
1957 db_printf("PQ_CACHE:");
1958 for(i=0;i<PQ_L2_SIZE;i++) {
1959 db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
1960 }
1961 db_printf("\n");
1962
1963 db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
1964 vm_page_queues[PQ_ACTIVE].lcnt,
1965 vm_page_queues[PQ_INACTIVE].lcnt);
1966}
1967#endif /* DDB */