/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_glue.c	8.6 (Berkeley) 1/5/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $FreeBSD: src/sys/vm/vm_glue.c,v 1.94.2.4 2003/01/13 22:51:17 dillon Exp $
 * $DragonFly: src/sys/vm/vm_glue.c,v 1.45 2006/11/07 17:51:24 dillon Exp $
 */

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/buf.h>
#include <sys/shm.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <sys/kernel.h>
#include <sys/unistd.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include <sys/user.h>
#include <vm/vm_page2.h>
#include <sys/thread2.h>

/*
 * System initialization
 *
 * Note: proc0 from proc.h
 */

static void vm_init_limits (void *);
SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0)

/*
 * THIS MUST BE THE LAST INITIALIZATION ITEM!!!
 *
 * Note: run scheduling should be divorced from the vm system.
 */
static void scheduler (void *);
SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, scheduler, NULL)

#ifdef INVARIANTS

static int swap_debug = 0;
SYSCTL_INT(_vm, OID_AUTO, swap_debug,
	CTLFLAG_RW, &swap_debug, 0, "");

#endif

static int scheduler_notify;

static void swapout (struct proc *);

int
kernacc(c_caddr_t addr, int len, int rw)
{
	boolean_t rv;
	vm_offset_t saddr, eaddr;
	vm_prot_t prot;

	KASSERT((rw & (~VM_PROT_ALL)) == 0,
	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));

	/*
	 * The globaldata space is not part of the kernel_map proper,
	 * check access separately.
	 */
	if (is_globaldata_space((vm_offset_t)addr, (vm_offset_t)(addr + len)))
		return (TRUE);

	/*
	 * Nominal kernel memory access - check access via kernel_map.
	 */
	if ((vm_offset_t)addr + len > kernel_map->max_offset ||
	    (vm_offset_t)addr + len < (vm_offset_t)addr) {
		return (FALSE);
	}
	prot = rw;
	saddr = trunc_page((vm_offset_t)addr);
	eaddr = round_page((vm_offset_t)addr + len);
	vm_map_lock_read(kernel_map);
	rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
	vm_map_unlock_read(kernel_map);
	return (rv == TRUE);
}
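
/*
 * Illustrative sketch only (not part of the original file): a hypothetical
 * caller would typically vet a kernel range with kernacc() before touching
 * it, e.g. a debugger-style read routine.  kva, len, and buf are assumed
 * names here; only kernacc() itself is defined above.
 *
 *	if (!kernacc((c_caddr_t)kva, len, VM_PROT_READ))
 *		return (EFAULT);
 *	bcopy((const void *)kva, buf, len);
 */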

int
useracc(c_caddr_t addr, int len, int rw)
{
	boolean_t rv;
	vm_prot_t prot;
	vm_map_t map;
	vm_map_entry_t save_hint;

	KASSERT((rw & (~VM_PROT_ALL)) == 0,
	    ("illegal ``rw'' argument to useracc (%x)\n", rw));
	prot = rw;
	/*
	 * XXX - check separately to disallow access to user area and user
	 * page tables - they are in the map.
	 *
	 * XXX - VM_MAX_USER_ADDRESS is an end address, not a max.  It was once
	 * only used (as an end address) in trap.c.  Use it as an end address
	 * here too.  This bogusness has spread.  I just fixed where it was
	 * used as a max in vm_mmap.c.
	 */
	if ((vm_offset_t)addr + len > /* XXX */ VM_MAX_USER_ADDRESS ||
	    (vm_offset_t)addr + len < (vm_offset_t)addr) {
		return (FALSE);
	}
	map = &curproc->p_vmspace->vm_map;
	vm_map_lock_read(map);
	/*
	 * We save the map hint, and restore it.  Useracc appears to distort
	 * the map hint unnecessarily.
	 */
	save_hint = map->hint;
	rv = vm_map_check_protection(map,
	    trunc_page((vm_offset_t)addr),
	    round_page((vm_offset_t)addr + len), prot);
	map->hint = save_hint;
	vm_map_unlock_read(map);

	return (rv == TRUE);
}

void
vslock(caddr_t addr, u_int len)
{
	if (len) {
		vm_map_wire(&curproc->p_vmspace->vm_map,
		    trunc_page((vm_offset_t)addr),
		    round_page((vm_offset_t)addr + len), 0);
	}
}

void
vsunlock(caddr_t addr, u_int len)
{
	if (len) {
		vm_map_wire(&curproc->p_vmspace->vm_map,
		    trunc_page((vm_offset_t)addr),
		    round_page((vm_offset_t)addr + len),
		    KM_PAGEABLE);
	}
}
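
/*
 * Sketch of the expected vslock()/vsunlock() pairing (hypothetical caller,
 * not part of this file): wire a user buffer down for the duration of an
 * I/O operation, then make it pageable again.  udata/ulen are assumed
 * names:
 *
 *	vslock(udata, ulen);	wire: fault the pages in and pin them
 *	(perform the raw I/O into udata)
 *	vsunlock(udata, ulen);	unwire: the pages become pageable again
 */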

/*
 * Implement fork's actions on an address space.
 * Here we arrange for the address space to be copied or referenced,
 * allocate a user struct (pcb and kernel stack), then call the
 * machine-dependent layer to fill those in and make the new process
 * ready to run.  The new process is set up so that it returns directly
 * to user mode to avoid stack copying and relocation problems.
 */
void
vm_fork(struct lwp *lp1, struct proc *p2, int flags)
{
	struct user *up;
	struct proc *p1 = lp1->lwp_proc;
	struct thread *td2;

	if ((flags & RFPROC) == 0) {
		/*
		 * Divorce the memory: if it is shared, this changes
		 * shared memory amongst threads into COW locally.
		 */
		if ((flags & RFMEM) == 0) {
			if (p1->p_vmspace->vm_refcnt > 1) {
				vmspace_unshare(p1);
			}
		}
		cpu_fork(lp1, NULL, flags);
		return;
	}

	if (flags & RFMEM) {
		p2->p_vmspace = p1->p_vmspace;
		p1->p_vmspace->vm_refcnt++;
	}

	while (vm_page_count_severe()) {
		vm_wait();
	}

	if ((flags & RFMEM) == 0) {
		p2->p_vmspace = vmspace_fork(p1->p_vmspace);

		pmap_pinit2(vmspace_pmap(p2->p_vmspace));

		if (p1->p_vmspace->vm_shm)
			shmfork(p1, p2);
	}

	td2 = lwkt_alloc_thread(NULL, LWKT_THREAD_STACK, -1, 0);
	pmap_init_proc(p2, td2);
	lwkt_setpri(td2, TDPRI_KERN_USER);
	lwkt_set_comm(td2, "%s", p1->p_comm);

	up = p2->p_addr;

	/*
	 * p_stats currently points at fields in the user struct
	 * but not at &u, instead at p_addr.  Copy parts of
	 * p_stats; zero the rest of p_stats (statistics).
	 *
	 * If procsig->ps_refcnt is 1 and p2->p_sigacts is NULL we don't need
	 * to share sigacts, so we use the up->u_sigacts.
	 */
	p2->p_stats = &up->u_stats;
	if (p2->p_sigacts == NULL) {
		if (p2->p_procsig->ps_refcnt != 1)
			printf("PID:%d NULL sigacts with refcnt not 1!\n",
			    p2->p_pid);
		p2->p_sigacts = &up->u_sigacts;
		up->u_sigacts = *p1->p_sigacts;
	}

	bzero(&up->u_stats, sizeof(struct pstats));

	/*
	 * cpu_fork will copy and update the pcb, set up the kernel stack,
	 * and make the child ready to run.
	 */
	cpu_fork(lp1, td2->td_lwp, flags);
}
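
/*
 * For illustration (see kern_fork.c for the authoritative call sites, this
 * note is a sketch): a normal fork() passes RFFDG | RFPROC, while vfork()
 * additionally passes RFMEM (and RFPPWAIT), so the vfork child shares the
 * parent's vmspace via the RFMEM branches above until it execs or exits.
 */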

/*
 * Called after the process has been wait(2)'ed upon and is being reaped.
 * The idea is to reclaim resources that we could not reclaim while
 * the process was still executing.
 */
void
vm_waitproc(struct proc *p)
{
	p->p_stats = NULL;
	cpu_proc_wait(p);
	vmspace_exitfree(p);	/* and clean-out the vmspace */
}

/*
 * Set default limits for VM system.
 * Called for proc 0, and then inherited by all others.
 *
 * XXX should probably act directly on proc0.
 */
static void
vm_init_limits(void *udata)
{
	struct proc *p = udata;
	int rss_limit;

	/*
	 * Set up the initial limits on process VM.  Set the maximum resident
	 * set size to be half of (reasonably) available memory.  Since this
	 * is a soft limit, it comes into effect only when the system is out
	 * of memory - half of main memory helps to favor smaller processes,
	 * and reduces thrashing of the object cache.
	 */
	p->p_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
	p->p_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
	p->p_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
	p->p_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
	/* limit the limit to no less than 2MB */
	rss_limit = max(vmstats.v_free_count, 512);
	p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit);
	p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY;
}
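
/*
 * Worked example for the floor above, assuming 4KB pages: 512 pages is
 * 512 * 4096 = 2MB, which is where the "no less than 2MB" comment comes
 * from.  ptoa() converts a page count to bytes, so ptoa(512) == 2097152
 * when PAGE_SIZE == 4096.
 */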

/*
 * Faultin the specified process.  Note that the process can be in any
 * state.  Just clear P_SWAPPEDOUT and call wakeup in case the process is
 * sleeping.
 */
void
faultin(struct proc *p)
{
	if (p->p_flag & P_SWAPPEDOUT) {
		/*
		 * The process is waiting in the kernel to return to user
		 * mode but cannot until P_SWAPPEDOUT gets cleared.
		 */
		crit_enter();
		p->p_flag &= ~(P_SWAPPEDOUT | P_SWAPWAIT);
#ifdef INVARIANTS
		if (swap_debug)
			printf("swapping in %d (%s)\n", p->p_pid, p->p_comm);
#endif
		wakeup(p);

		crit_exit();
	}
}
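
/*
 * The other half of this handshake lives in the user-mode reentry path,
 * not in this file (sketch): a thread that notices P_SWAPPEDOUT sets
 * P_SWAPWAIT and sleeps on its proc pointer, which is why clearing the
 * flags and calling wakeup(p) above is sufficient to release it.
 */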

/*
 * Kernel initialization eventually falls through to this function,
 * which is process 0.
 *
 * This swapin algorithm attempts to swap-in processes only if there
 * is enough space for them.  Of course, if a process waits for a long
 * time, it will be swapped in anyway.
 */

struct scheduler_info {
	struct proc *pp;
	int ppri;
};

static int scheduler_callback(struct proc *p, void *data);

static void
scheduler(void *dummy)
{
	struct scheduler_info info;
	struct proc *p;

	KKASSERT(!IN_CRITICAL_SECT(curthread));
loop:
	scheduler_notify = 0;
	/*
	 * Don't try to swap anything in if we are low on memory.
	 */
	if (vm_page_count_min()) {
		vm_wait();
		goto loop;
	}

	/*
	 * Look for a good candidate to wake up
	 */
	info.pp = NULL;
	info.ppri = INT_MIN;
	allproc_scan(scheduler_callback, &info);

	/*
	 * Nothing to do, back to sleep for at least 1/10 of a second.  If
	 * we are woken up, immediately process the next request.  If
	 * multiple requests have built up the first is processed
	 * immediately and the rest are staggered.
	 */
	if ((p = info.pp) == NULL) {
		tsleep(&proc0, 0, "nowork", hz / 10);
		if (scheduler_notify == 0)
			tsleep(&scheduler_notify, 0, "nowork", 0);
		goto loop;
	}

	/*
	 * Fault the selected process in, then wait for a short period of
	 * time and loop up.
	 *
	 * XXX we need a heuristic to get a measure of system stress and
	 * then adjust our stagger wakeup delay accordingly.
	 */
	faultin(p);
	p->p_swtime = 0;
	PRELE(p);
	tsleep(&proc0, 0, "swapin", hz / 10);
	goto loop;
}

static int
scheduler_callback(struct proc *p, void *data)
{
	struct scheduler_info *info = data;
	segsz_t pgs;
	int pri;

	if (p->p_flag & P_SWAPWAIT) {
		pri = p->p_swtime + p->p_slptime - p->p_nice * 8;

		/*
		 * The more pages paged out while we were swapped out,
		 * the more work we have to do to get up and running
		 * again and the lower our wakeup priority.
		 *
		 * Each second of sleep time is worth ~1MB
		 */
		pgs = vmspace_resident_count(p->p_vmspace);
		if (pgs < p->p_vmspace->vm_swrss) {
			pri -= (p->p_vmspace->vm_swrss - pgs) /
			    (1024 * 1024 / PAGE_SIZE);
		}

		/*
		 * If this process is higher priority and there is
		 * enough space, then select this process instead of
		 * the previous selection.
		 */
		if (pri > info->ppri) {
			if (info->pp)
				PRELE(info->pp);
			PHOLD(p);
			info->pp = p;
			info->ppri = pri;
		}
	}
	return(0);
}
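
/*
 * Worked example of the pri calculation above (hypothetical numbers,
 * assuming 4KB pages): a process with p_swtime 10, p_slptime 5 and
 * p_nice 0 that lost 1024 resident pages (4MB) while swapped out scores:
 *
 *	pri = 10 + 5 - 0 * 8 = 15
 *	pri -= 1024 / (1024 * 1024 / 4096)	(i.e. 1024 / 256 == 4)
 *	pri = 11
 *
 * so each megabyte paged out roughly cancels one second of accumulated
 * time, matching the "worth ~1MB" note above.
 */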

void
swapin_request(void)
{
	if (scheduler_notify == 0) {
		scheduler_notify = 1;
		wakeup(&scheduler_notify);
	}
}

#ifndef NO_SWAPPING

#define	swappable(p) \
	(((p)->p_lock == 0) && \
	 ((p)->p_flag & (P_TRACED|P_SYSTEM|P_SWAPPEDOUT|P_WEXIT)) == 0)
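
/*
 * Reading the macro: a process is a swap candidate only when nobody holds
 * it (p_lock == 0, i.e. no PHOLD() in effect) and it is not being traced,
 * is not a system process, is not already swapped out, and is not exiting.
 * For example, a process currently held via PHOLD() elsewhere in the
 * kernel is simply skipped until the matching PRELE().
 */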

/*
 * Swap_idle_threshold1 is the guaranteed swapped in time for a process
 */
static int swap_idle_threshold1 = 15;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1,
	CTLFLAG_RW, &swap_idle_threshold1, 0, "");

/*
 * Swap_idle_threshold2 is the time that a process can be idle before
 * it will be swapped out, if idle swapping is enabled.  Default is
 * one minute.
 */
static int swap_idle_threshold2 = 60;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2,
	CTLFLAG_RW, &swap_idle_threshold2, 0, "");

/*
 * Swapout is driven by the pageout daemon.  Very simple: we find eligible
 * procs and mark them as being swapped out.  This will cause the kernel
 * to prefer to pageout those proc's pages first and the procs in question
 * will not return to user mode until the swapper tells them they can.
 *
 * If any procs have been sleeping/stopped for at least maxslp seconds,
 * they are swapped.  Else, we swap the longest-sleeping or stopped process,
 * if any, otherwise the longest-resident process.
 */

static int swapout_procs_callback(struct proc *p, void *data);

void
swapout_procs(int action)
{
	allproc_scan(swapout_procs_callback, &action);
}
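
/*
 * Hypothetical call sites, for illustration only (the real callers live
 * in the pageout daemon, not in this file): swapout_procs(VM_SWAP_NORMAL)
 * under memory pressure, or swapout_procs(VM_SWAP_IDLE) when idle
 * swapping is enabled.
 */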

static int
swapout_procs_callback(struct proc *p, void *data)
{
	struct vmspace *vm;
	int action = *(int *)data;

	if (!swappable(p))
		return(0);

	vm = p->p_vmspace;

	if (p->p_stat == SSLEEP || p->p_stat == SRUN) {
		/*
		 * do not swap out a realtime process
		 */
		if (RTP_PRIO_IS_REALTIME(p->p_lwp.lwp_rtprio.type))
			return(0);

		/*
		 * Guarantee swap_idle_threshold time in memory
		 */
		if (p->p_slptime < swap_idle_threshold1)
			return(0);

		/*
		 * If the system is under memory stress, or if we
		 * are swapping idle processes >= swap_idle_threshold2,
		 * then swap the process out.
		 */
		if (((action & VM_SWAP_NORMAL) == 0) &&
		    (((action & VM_SWAP_IDLE) == 0) ||
		     (p->p_slptime < swap_idle_threshold2))) {
			return(0);
		}

		++vm->vm_refcnt;

		/*
		 * If the process has been asleep for awhile, swap
		 * it out.
		 */
		if ((action & VM_SWAP_NORMAL) ||
		    ((action & VM_SWAP_IDLE) &&
		     (p->p_slptime > swap_idle_threshold2))) {
			swapout(p);
		}

		/*
		 * cleanup our reference
		 */
		vmspace_free(vm);
	}
	return(0);
}

static void
swapout(struct proc *p)
{
#ifdef INVARIANTS
	if (swap_debug)
		printf("swapping out %d (%s)\n", p->p_pid, p->p_comm);
#endif
	++p->p_stats->p_ru.ru_nswap;
	/*
	 * remember the process resident count
	 */
	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
	p->p_flag |= P_SWAPPEDOUT;
	p->p_swtime = 0;
}

#endif /* !NO_SWAPPING */