sys/kern/kern_fork.c

   1 /*
   2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
  39  * $FreeBSD: src/sys/kern/kern_fork.c,v 1.72.2.13 2003/06/06 20:21:32 tegge Exp $
  40  */
  41
  42 #include "opt_ktrace.h"
  43
  44 #include <sys/param.h>
  45 #include <sys/systm.h>
  46 #include <sys/sysproto.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/kernel.h>
  49 #include <sys/sysctl.h>
  50 #include <sys/malloc.h>
  51 #include <sys/proc.h>
  52 #include <sys/resourcevar.h>
  53 #include <sys/vnode.h>
  54 #include <sys/acct.h>
  55 #include <sys/ktrace.h>
  56 #include <sys/unistd.h>
  57 #include <sys/jail.h>
  58
  59 #include <vm/vm.h>
  60 #include <sys/lock.h>
  61 #include <vm/pmap.h>
  62 #include <vm/vm_map.h>
  63 #include <vm/vm_extern.h>
  64 #include <vm/vm_zone.h>
  65
  66 #include <sys/vmmeter.h>
  67 #include <sys/user.h>
  68
  69 static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback"); /* malloc type for the fork_list entries allocated in at_fork() and freed in rm_at_fork() */
  70
  71 /*
  72  * These are the structures used to create a callout list for things to do
  73  * when forking a process.  Entries are added by at_fork(), removed by
  74  * rm_at_fork(), and walked by fork1() once parent and child are set up. */
  75 struct forklist {
  76         forklist_fn function;           /* callback; fork1() calls it as (*function)(p1, p2, flags) */
  77         TAILQ_ENTRY(forklist) next;     /* linkage on fork_list */
  78 };
  79
  80 TAILQ_HEAD(forklist_head, forklist);    /* declares struct forklist_head */
  81 static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list); /* registered callbacks; at_fork() appends at the tail */
  82
  83 #ifndef _SYS_SYSPROTO_H_ /* NOTE(review): presumably the include guard of <sys/sysproto.h>, so this is only a fallback definition -- confirm */
  84 struct fork_args {
  85         int     dummy;          /* placeholder member; fork() never reads its uap argument */
  86 };
  87 #endif
  88
  89 int forksleep; /* Wait channel: fork1() tsleep()s on this for hz / 2 before returning EAGAIN. */
  90
  91 /* ARGSUSED */
  92 int
  93 fork(p, uap)
  94         struct proc *p;
  95         struct fork_args *uap;
  96 {
  97         int error;
  98         struct proc *p2;
  99
 100         error = fork1(p, RFFDG | RFPROC, &p2);
 101         if (error == 0) {
 102                 p->p_retval[0] = p2->p_pid;
 103                 p->p_retval[1] = 0;
 104         }
 105         return error;
 106 }
 107
 108 /* ARGSUSED */
 109 int
 110 vfork(p, uap)
 111         struct proc *p;
 112         struct vfork_args *uap;
 113 {
 114         int error;
 115         struct proc *p2;
 116
 117         error = fork1(p, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2);
 118         if (error == 0) {
 119                 p->p_retval[0] = p2->p_pid;
 120                 p->p_retval[1] = 0;
 121         }
 122         return error;
 123 }
 124
 125 int
 126 rfork(p, uap)
 127         struct proc *p;
 128         struct rfork_args *uap;
 129 {
 130         int error;
 131         struct proc *p2;
 132
 133         error = fork1(p, uap->flags, &p2);
 134         if (error == 0) {
 135                 p->p_retval[0] = p2 ? p2->p_pid : 0;
 136                 p->p_retval[1] = 0;
 137         }
 138         return error;
 139 }
 140
 141
 142 int     nprocs = 1;             /* current number of processes; starts at 1 for process 0 */
 143 static int nextpid = 0;         /* pid candidate; fork1() advances it and assigns the result to the new process */
 144
 145 /*
 146  * Random component to nextpid generation.  We mix in a random factor to make
 147  * it a little harder to predict.  We sanity check the modulus value to avoid
 148  * doing it in critical paths.  Don't let it be too small or we pointlessly
 149  * waste randomness entropy, and don't let it be impossibly large.  Using a
 150  * modulus that is too big causes a LOT more process table scans and slows
 151  * down fork processing as the pidchecked caching is defeated.
 152  */
 153 static int randompid = 0;       /* 0 disables; otherwise fork1() adds arc4random() % randompid to nextpid */
 154
 155 static int
 156 sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
 157 {
 158                 int error, pid;
 159
 160                 pid = randompid;
 161                 error = sysctl_handle_int(oidp, &pid, 0, req);
 162                 if (error || !req->newptr)
 163                         return (error);
 164                 if (pid < 0 || pid > PID_MAX - 100)     /* out of range */
 165                         pid = PID_MAX - 100;
 166                 else if (pid < 2)                       /* NOP */
 167                         pid = 0;
 168                 else if (pid < 100)                     /* Make it reasonable */
 169                         pid = 100;
 170                 randompid = pid;
 171                 return (error);
 172 }
 173
 174 SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, /* read-write integer node "randompid" under _kern */
 175     0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); /* reads and writes go through sysctl_kern_randompid() */
 176
 177 int
 178 fork1(p1, flags, procp)
 179         struct proc *p1;
 180         int flags;
 181         struct proc **procp;
 182 {
 183         struct proc *p2, *pptr;
 184         uid_t uid;
 185         struct proc *newproc;
 186         int ok;
 187         static int pidchecked = 0;
 188         struct forklist *ep;
 189         struct filedesc_to_leader *fdtol;
 190
 191         if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
 192                 return (EINVAL);
 193
 194         /*
 195          * Here we don't create a new process, but we divorce
 196          * certain parts of a process from itself.
 197          */
 198         if ((flags & RFPROC) == 0) {
 199
 200                 vm_fork(p1, 0, flags);
 201
 202                 /*
 203                  * Close all file descriptors.
 204                  */
 205                 if (flags & RFCFDG) {
 206                         struct filedesc *fdtmp;
 207                         fdtmp = fdinit(p1);
 208                         fdfree(p1);
 209                         p1->p_fd = fdtmp;
 210                 }
 211
 212                 /*
 213                  * Unshare file descriptors (from parent.)
 214                  */
 215                 if (flags & RFFDG) {
 216                         if (p1->p_fd->fd_refcnt > 1) {
 217                                 struct filedesc *newfd;
 218                                 newfd = fdcopy(p1);
 219                                 fdfree(p1);
 220                                 p1->p_fd = newfd;
 221                         }
 222                 }
 223                 *procp = NULL;
 224                 return (0);
 225         }
 226
 227         /*
 228          * Although process entries are dynamically created, we still keep
 229          * a global limit on the maximum number we will create.  Don't allow
 230          * a nonprivileged user to use the last ten processes; don't let root
 231          * exceed the limit. The variable nprocs is the current number of
 232          * processes, maxproc is the limit.
 233          */
 234         uid = p1->p_cred->p_ruid;
 235         if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) {
 236                 tsleep(&forksleep, PUSER, "fork", hz / 2);
 237                 return (EAGAIN);
 238         }
 239         /*
 240          * Increment the nprocs resource before blocking can occur.  There
 241          * are hard-limits as to the number of processes that can run.
 242          */
 243         nprocs++;
 244
 245         /*
 246          * Increment the count of procs running with this uid. Don't allow
 247          * a nonprivileged user to exceed their current limit.
 248          */
 249         ok = chgproccnt(p1->p_cred->p_uidinfo, 1,
 250                 (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0);
 251         if (!ok) {
 252                 /*
 253                  * Back out the process count
 254                  */
 255                 nprocs--;
 256                 tsleep(&forksleep, PUSER, "fork", hz / 2);
 257                 return (EAGAIN);
 258         }
 259
 260         /* Allocate new proc. */
 261         newproc = zalloc(proc_zone);
 262
 263         /*
 264          * Setup linkage for kernel based threading
 265          */
 266         if((flags & RFTHREAD) != 0) {
 267                 newproc->p_peers = p1->p_peers;
 268                 p1->p_peers = newproc;
 269                 newproc->p_leader = p1->p_leader;
 270         } else {
 271                 newproc->p_peers = 0;
 272                 newproc->p_leader = newproc;
 273         }
 274
 275         newproc->p_wakeup = 0;
 276
 277         newproc->p_vmspace = NULL;
 278
 279         /*
 280          * Find an unused process ID.  We remember a range of unused IDs
 281          * ready to use (from nextpid+1 through pidchecked-1).
 282          */
 283         nextpid++;
 284         if (randompid)
 285                 nextpid += arc4random() % randompid;
 286 retry:
 287         /*
 288          * If the process ID prototype has wrapped around,
 289          * restart somewhat above 0, as the low-numbered procs
 290          * tend to include daemons that don't exit.
 291          */
 292         if (nextpid >= PID_MAX) {
 293                 nextpid = nextpid % PID_MAX;
 294                 if (nextpid < 100)
 295                         nextpid += 100;
 296                 pidchecked = 0;
 297         }
 298         if (nextpid >= pidchecked) {
 299                 int doingzomb = 0;
 300
 301                 pidchecked = PID_MAX;
 302                 /*
 303                  * Scan the active and zombie procs to check whether this pid
 304                  * is in use.  Remember the lowest pid that's greater
 305                  * than nextpid, so we can avoid checking for a while.
 306                  */
 307                 p2 = LIST_FIRST(&allproc);
 308 again:
 309                 for (; p2 != 0; p2 = LIST_NEXT(p2, p_list)) {
 310                         while (p2->p_pid == nextpid ||
 311                             p2->p_pgrp->pg_id == nextpid ||
 312                             p2->p_session->s_sid == nextpid) {
 313                                 nextpid++;
 314                                 if (nextpid >= pidchecked)
 315                                         goto retry;
 316                         }
 317                         if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
 318                                 pidchecked = p2->p_pid;
 319                         if (p2->p_pgrp->pg_id > nextpid &&
 320                             pidchecked > p2->p_pgrp->pg_id)
 321                                 pidchecked = p2->p_pgrp->pg_id;
 322                         if (p2->p_session->s_sid > nextpid &&
 323                             pidchecked > p2->p_session->s_sid)
 324                                 pidchecked = p2->p_session->s_sid;
 325                 }
 326                 if (!doingzomb) {
 327                         doingzomb = 1;
 328                         p2 = LIST_FIRST(&zombproc);
 329                         goto again;
 330                 }
 331         }
 332
 333         p2 = newproc;
 334         p2->p_stat = SIDL;                      /* protect against others */
 335         p2->p_pid = nextpid;
 336         LIST_INSERT_HEAD(&allproc, p2, p_list);
 337         LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
 338
 339         /*
 340          * Make a proc table entry for the new process.
 341          * Start by zeroing the section of proc that is zero-initialized,
 342          * then copy the section that is copied directly from the parent.
 343          */
 344         bzero(&p2->p_startzero,
 345             (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
 346         bcopy(&p1->p_startcopy, &p2->p_startcopy,
 347             (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
 348
 349         p2->p_aioinfo = NULL;
 350
 351         /*
 352          * Duplicate sub-structures as needed.
 353          * Increase reference counts on shared objects.
 354          * The p_stats and p_sigacts substructs are set in vm_fork.
 355          */
 356         p2->p_flag = P_INMEM;
 357         if (p1->p_flag & P_PROFIL)
 358                 startprofclock(p2);
 359         MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred),
 360             M_SUBPROC, M_WAITOK);
 361         bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred));
 362         p2->p_cred->p_refcnt = 1;
 363         crhold(p1->p_ucred);
 364         uihold(p1->p_cred->p_uidinfo);
 365
 366         if (p2->p_prison) {
 367                 p2->p_prison->pr_ref++;
 368                 p2->p_flag |= P_JAILED;
 369         }
 370
 371         if (p2->p_args)
 372                 p2->p_args->ar_ref++;
 373
 374         if (flags & RFSIGSHARE) {
 375                 p2->p_procsig = p1->p_procsig;
 376                 p2->p_procsig->ps_refcnt++;
 377                 if (p1->p_sigacts == &p1->p_addr->u_sigacts) {
 378                         struct sigacts *newsigacts;
 379                         int s;
 380
 381                         /* Create the shared sigacts structure */
 382                         MALLOC(newsigacts, struct sigacts *,
 383                             sizeof(struct sigacts), M_SUBPROC, M_WAITOK);
 384                         s = splhigh();
 385                         /*
 386                          * Set p_sigacts to the new shared structure.
 387                          * Note that this is updating p1->p_sigacts at the
 388                          * same time, since p_sigacts is just a pointer to
 389                          * the shared p_procsig->ps_sigacts.
 390                          */
 391                         p2->p_sigacts  = newsigacts;
 392                         bcopy(&p1->p_addr->u_sigacts, p2->p_sigacts,
 393                             sizeof(*p2->p_sigacts));
 394                         *p2->p_sigacts = p1->p_addr->u_sigacts;
 395                         splx(s);
 396                 }
 397         } else {
 398                 MALLOC(p2->p_procsig, struct procsig *, sizeof(struct procsig),
 399                     M_SUBPROC, M_WAITOK);
 400                 bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig));
 401                 p2->p_procsig->ps_refcnt = 1;
 402                 p2->p_sigacts = NULL;   /* finished in vm_fork() */
 403         }
 404         if (flags & RFLINUXTHPN)
 405                 p2->p_sigparent = SIGUSR1;
 406         else
 407                 p2->p_sigparent = SIGCHLD;
 408
 409         /* bump references to the text vnode (for procfs) */
 410         p2->p_textvp = p1->p_textvp;
 411         if (p2->p_textvp)
 412                 VREF(p2->p_textvp);
 413
 414         if (flags & RFCFDG) {
 415                 p2->p_fd = fdinit(p1);
 416                 fdtol = NULL;
 417         } else if (flags & RFFDG) {
 418                 p2->p_fd = fdcopy(p1);
 419                 fdtol = NULL;
 420         } else {
 421                 p2->p_fd = fdshare(p1);
 422                 if (p1->p_fdtol == NULL)
 423                         p1->p_fdtol =
 424                                 filedesc_to_leader_alloc(NULL,
 425                                                          p1->p_leader);
 426                 if ((flags & RFTHREAD) != 0) {
 427                         /*
 428                          * Shared file descriptor table and
 429                          * shared process leaders.
 430                          */
 431                         fdtol = p1->p_fdtol;
 432                         fdtol->fdl_refcount++;
 433                 } else {
 434                         /*
 435                          * Shared file descriptor table, and
 436                          * different process leaders
 437                          */
 438                         fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
 439                                                          p2);
 440                 }
 441         }
 442         p2->p_fdtol = fdtol;
 443
 444         /*
 445          * If p_limit is still copy-on-write, bump refcnt,
 446          * otherwise get a copy that won't be modified.
 447          * (If PL_SHAREMOD is clear, the structure is shared
 448          * copy-on-write.)
 449          */
 450         if (p1->p_limit->p_lflags & PL_SHAREMOD)
 451                 p2->p_limit = limcopy(p1->p_limit);
 452         else {
 453                 p2->p_limit = p1->p_limit;
 454                 p2->p_limit->p_refcnt++;
 455         }
 456
 457         /*
 458          * Preserve some more flags in subprocess.  P_PROFIL has already
 459          * been preserved.
 460          */
 461         p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK);
 462         if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
 463                 p2->p_flag |= P_CONTROLT;
 464         if (flags & RFPPWAIT)
 465                 p2->p_flag |= P_PPWAIT;
 466
 467         LIST_INSERT_AFTER(p1, p2, p_pglist);
 468
 469         /*
 470          * Attach the new process to its parent.
 471          *
 472          * If RFNOWAIT is set, the newly created process becomes a child
 473          * of init.  This effectively disassociates the child from the
 474          * parent.
 475          */
 476         if (flags & RFNOWAIT)
 477                 pptr = initproc;
 478         else
 479                 pptr = p1;
 480         p2->p_pptr = pptr;
 481         LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
 482         LIST_INIT(&p2->p_children);
 483
 484 #ifdef KTRACE
 485         /*
 486          * Copy traceflag and tracefile if enabled.  If not inherited,
 487          * these were zeroed above but we still could have a trace race
 488          * so make sure p2's p_tracep is NULL.
 489          */
 490         if ((p1->p_traceflag & KTRFAC_INHERIT) && p2->p_tracep == NULL) {
 491                 p2->p_traceflag = p1->p_traceflag;
 492                 if ((p2->p_tracep = p1->p_tracep) != NULL)
 493                         VREF(p2->p_tracep);
 494         }
 495 #endif
 496
 497         /*
 498          * set priority of child to be that of parent
 499          */
 500         p2->p_estcpu = p1->p_estcpu;
 501
 502         /*
 503          * This begins the section where we must prevent the parent
 504          * from being swapped.
 505          */
 506         PHOLD(p1);
 507
 508         /*
 509          * Finish creating the child process.  It will return via a different
 510          * execution path later.  (ie: directly into user mode)
 511          */
 512         vm_fork(p1, p2, flags);
 513
 514         if (flags == (RFFDG | RFPROC)) {
 515                 cnt.v_forks++;
 516                 cnt.v_forkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
 517         } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
 518                 cnt.v_vforks++;
 519                 cnt.v_vforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
 520         } else if (p1 == &proc0) {
 521                 cnt.v_kthreads++;
 522                 cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
 523         } else {
 524                 cnt.v_rforks++;
 525                 cnt.v_rforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
 526         }
 527
 528         /*
 529          * Both processes are set up, now check if any loadable modules want
 530          * to adjust anything.
 531          *   What if they have an error? XXX
 532          */
 533         TAILQ_FOREACH(ep, &fork_list, next) {
 534                 (*ep->function)(p1, p2, flags);
 535         }
 536
 537         /*
 538          * Make child runnable and add to run queue.
 539          */
 540         microtime(&(p2->p_stats->p_start));
 541         p2->p_acflag = AFORK;
 542         (void) splhigh();
 543         p2->p_stat = SRUN;
 544         setrunqueue(p2);
 545         (void) spl0();
 546
 547         /*
 548          * Now can be swapped.
 549          */
 550         PRELE(p1);
 551
 552         /*
 553          * tell any interested parties about the new process
 554          */
 555         KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid);
 556
 557         /*
 558          * Preserve synchronization semantics of vfork.  If waiting for
 559          * child to exec or exit, set P_PPWAIT on child, and sleep on our
 560          * proc (in case of exit).
 561          */
 562         while (p2->p_flag & P_PPWAIT)
 563                 tsleep(p1, PWAIT, "ppwait", 0);
 564
 565         /*
 566          * Return child proc pointer to parent.
 567          */
 568         *procp = p2;
 569         return (0);
 570 }
 571
 572 /*
 573  * The next two functionms are general routines to handle adding/deleting
 574  * items on the fork callout list.
 575  *
 576  * at_fork():
 577  * Take the arguments given and put them onto the fork callout list,
 578  * However first make sure that it's not already there.
 579  * Returns 0 on success or a standard error number.
 580  */
 581
 582 int
 583 at_fork(function)
 584         forklist_fn function;
 585 {
 586         struct forklist *ep;
 587
 588 #ifdef INVARIANTS
 589         /* let the programmer know if he's been stupid */
 590         if (rm_at_fork(function))
 591                 printf("WARNING: fork callout entry (%p) already present\n",
 592                     function);
 593 #endif
 594         ep = malloc(sizeof(*ep), M_ATFORK, M_NOWAIT);
 595         if (ep == NULL)
 596                 return (ENOMEM);
 597         ep->function = function;
 598         TAILQ_INSERT_TAIL(&fork_list, ep, next);
 599         return (0);
 600 }
 601
 602 /*
 603  * Scan the exit callout list for the given item and remove it..
 604  * Returns the number of items removed (0 or 1)
 605  */
 606
 607 int
 608 rm_at_fork(function)
 609         forklist_fn function;
 610 {
 611         struct forklist *ep;
 612
 613         TAILQ_FOREACH(ep, &fork_list, next) {
 614                 if (ep->function == function) {
 615                         TAILQ_REMOVE(&fork_list, ep, next);
 616                         free(ep, M_ATFORK);
 617                         return(1);
 618                 }
 619         }
 620         return (0);
 621 }