sys/kern/vfs_syscalls.c

   1 /*
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      @(#)vfs_syscalls.c      8.13 (Berkeley) 4/15/94
  39  * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $
  40  * $DragonFly: src/sys/kern/vfs_syscalls.c,v 1.76 2006/01/04 18:11:26 dillon Exp $
  41  */
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/buf.h>
  46 #include <sys/conf.h>
  47 #include <sys/sysent.h>
  48 #include <sys/malloc.h>
  49 #include <sys/mount.h>
  50 #include <sys/mountctl.h>
  51 #include <sys/sysproto.h>
  52 #include <sys/filedesc.h>
  53 #include <sys/kernel.h>
  54 #include <sys/fcntl.h>
  55 #include <sys/file.h>
  56 #include <sys/linker.h>
  57 #include <sys/stat.h>
  58 #include <sys/unistd.h>
  59 #include <sys/vnode.h>
  60 #include <sys/proc.h>
  61 #include <sys/namei.h>
  62 #include <sys/nlookup.h>
  63 #include <sys/dirent.h>
  64 #include <sys/extattr.h>
  65 #include <sys/kern_syscall.h>
  66
  67 #include <machine/limits.h>
  68 #include <vfs/union/union.h>
  69 #include <sys/sysctl.h>
  70 #include <vm/vm.h>
  71 #include <vm/vm_object.h>
  72 #include <vm/vm_zone.h>
  73 #include <vm/vm_page.h>
  74
  75 #include <sys/file2.h>
  76
  77 static int checkvp_chdir (struct vnode *vn, struct thread *td);
  78 static void checkdirs (struct vnode *olddp, struct namecache *ncp);
  79 static int chroot_refuse_vdir_fds (struct filedesc *fdp);
  80 static int chroot_visible_mnt(struct mount *mp, struct proc *p);
  81 static int getutimes (const struct timeval *, struct timespec *);
  82 static int setfown (struct vnode *, uid_t, gid_t);
  83 static int setfmode (struct vnode *, int);
  84 static int setfflags (struct vnode *, int);
  85 static int setutimes (struct vnode *, const struct timespec *, int);
  86 static int      usermount = 0;  /* if 1, non-root can mount fs. */
  87
  88 int (*union_dircheckp) (struct thread *, struct vnode **, struct file *);
  89
  90 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "");
  91
  92 /*
  93  * Virtual File System System Calls
  94  */
  95
   96 /*
   97  * Mount a file system.
       *
       * Looks up uap->path and then takes one of two routes:
       *
       *  - MNT_UPDATE: the path must name the root vnode (VROOT) of an
       *    existing mount.  The mount's current flags are saved, the new
       *    flags are applied, VFS_MOUNT() is called, and the saved flags are
       *    restored if it fails.
       *
       *  - New mount: the caller must own the directory (or pass suser()),
       *    the target must be a directory, and uap->type must name a
       *    filesystem in the vfsconf list (a module load is attempted for
       *    suser() callers only).  A struct mount is allocated, passed to
       *    VFS_MOUNT(), and on success linked to the covered vnode and
       *    inserted into the mount list.
       *
       * Jailed callers (cr_prison set) get EPERM.  Callers that fail suser()
       * are refused unless usermount is set, may not pass MNT_EXPORTED, and
       * silently get MNT_NOSUID | MNT_NODEV added to their flags.
       *
       * Returns 0 on success, otherwise an errno value.
   98  */
   99 /*
  100  * mount_args(char *type, char *path, int flags, caddr_t data)
  101  */
  102 /* ARGSUSED */
  103 int
  104 mount(struct mount_args *uap)
  105 {
  106         struct thread *td = curthread;
  107         struct proc *p = td->td_proc;
  108         struct vnode *vp;
  109         struct namecache *ncp;
  110         struct mount *mp;
  111         struct vfsconf *vfsp;
  112         int error, flag = 0, flag2 = 0;
  113         struct vattr va;
  114         struct nlookupdata nd;
  115         char fstypename[MFSNAMELEN];
  116         struct nlcomponent nlc;
  117
  118         KKASSERT(p);
  119         if (p->p_ucred->cr_prison != NULL)
  120                 return (EPERM);
  121         if (usermount == 0 && (error = suser(td)))
  122                 return (error);
  123         /*
  124          * Do not allow NFS export by non-root users.
  125          */
  126         if (uap->flags & MNT_EXPORTED) {
  127                 error = suser(td);
  128                 if (error)
  129                         return (error);
  130         }
  131         /*
  132          * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
  133          */
  134         if (suser(td))
  135                 uap->flags |= MNT_NOSUID | MNT_NODEV;
  136
  137         /*
  138          * Lookup the requested path and extract the ncp and vnode.
  139          */
  140         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
  141         if (error == 0) {
  142                 if ((error = nlookup(&nd)) == 0) {
  143                         if (nd.nl_ncp->nc_vp == NULL)
  144                                 error = ENOENT;
  145                 }
  146         }
  147         if (error) {
  148                 nlookup_done(&nd);
  149                 return (error);
  150         }
  151
  152         /*
  153          * Extract the locked+refd ncp and cleanup the nd structure
              * (nl_ncp is cleared first so that nlookup_done() leaves the
              * extracted ncp alone).
  154          */
  155         ncp = nd.nl_ncp;
  156         nd.nl_ncp = NULL;
  157         nlookup_done(&nd);
  158
  159         /*
  160          * now we have the locked ref'd ncp and unreferenced vnode.
  161          */
  162         vp = ncp->nc_vp;
  163         if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) {
  164                 cache_put(ncp);
  165                 return (error);
  166         }
  167         cache_unlock(ncp);
  168
  169         /*
  170          * Now we have an unlocked ref'd ncp and a locked ref'd vp
              *
              * Every early return from here on must therefore cache_drop(ncp)
              * and vput(vp).
  171          */
  172         if (uap->flags & MNT_UPDATE) {
  173                 if ((vp->v_flag & VROOT) == 0) {
  174                         cache_drop(ncp);
  175                         vput(vp);
  176                         return (EINVAL);
  177                 }
  178                 mp = vp->v_mount;
                      /*
                       * Remember the current flags; they are restored after
                       * VFS_MOUNT() below if the update fails.
                       */
  179                 flag = mp->mnt_flag;
  180                 flag2 = mp->mnt_kern_flag;
  181                 /*
  182                  * We only allow the filesystem to be reloaded if it
  183                  * is currently mounted read-only.
  184                  */
  185                 if ((uap->flags & MNT_RELOAD) &&
  186                     ((mp->mnt_flag & MNT_RDONLY) == 0)) {
  187                         cache_drop(ncp);
  188                         vput(vp);
  189                         return (EOPNOTSUPP);    /* Needs translation */
  190                 }
  191                 /*
  192                  * Only root, or the user that did the original mount is
  193                  * permitted to update it.
  194                  */
  195                 if (mp->mnt_stat.f_owner != p->p_ucred->cr_uid &&
  196                     (error = suser(td))) {
  197                         cache_drop(ncp);
  198                         vput(vp);
  199                         return (error);
  200                 }
  201                 if (vfs_busy(mp, LK_NOWAIT, td)) {
  202                         cache_drop(ncp);
  203                         vput(vp);
  204                         return (EBUSY);
  205                 }
                      /* VMOUNT marks a mount operation in progress on vp. */
  206                 if ((vp->v_flag & VMOUNT) != 0 ||
  207                     vp->v_mountedhere != NULL) {
  208                         cache_drop(ncp);
  209                         vfs_unbusy(mp, td);
  210                         vput(vp);
  211                         return (EBUSY);
  212                 }
  213                 vp->v_flag |= VMOUNT;
  214                 mp->mnt_flag |=
  215                     uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
                      /* vp stays referenced but is unlocked across VFS_MOUNT(). */
  216                 VOP_UNLOCK(vp, 0, td);
  217                 goto update;
  218         }
  219         /*
  220          * If the user is not root, ensure that they own the directory
  221          * onto which we are attempting to mount.
  222          */
  223         if ((error = VOP_GETATTR(vp, &va, td)) ||
  224             (va.va_uid != p->p_ucred->cr_uid &&
  225              (error = suser(td)))) {
  226                 cache_drop(ncp);
  227                 vput(vp);
  228                 return (error);
  229         }
  230         if ((error = vinvalbuf(vp, V_SAVE, td, 0, 0)) != 0) {
  231                 cache_drop(ncp);
  232                 vput(vp);
  233                 return (error);
  234         }
  235         if (vp->v_type != VDIR) {
  236                 cache_drop(ncp);
  237                 vput(vp);
  238                 return (ENOTDIR);
  239         }
  240         if ((error = copyinstr(uap->type, fstypename, MFSNAMELEN, NULL)) != 0) {
  241                 cache_drop(ncp);
  242                 vput(vp);
  243                 return (error);
  244         }
              /* Find the requested filesystem type among the configured ones. */
  245         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
  246                 if (!strcmp(vfsp->vfc_name, fstypename))
  247                         break;
  248         }
  249         if (vfsp == NULL) {
  250                 linker_file_t lf;
  251
  252                 /* Only load modules for root (very important!) */
  253                 if ((error = suser(td)) != 0) {
  254                         cache_drop(ncp);
  255                         vput(vp);
  256                         return error;
  257                 }
                      /*
                       * NOTE(review): lf has no initializer and is compared
                       * against NULL below even when linker_load_file()
                       * reports an error; whether the callee stores to it on
                       * failure is not visible from here - confirm.
                       */
  258                 error = linker_load_file(fstypename, &lf);
  259                 if (error || lf == NULL) {
  260                         cache_drop(ncp);
  261                         vput(vp);
  262                         if (lf == NULL)
  263                                 error = ENODEV;
  264                         return error;
  265                 }
  266                 lf->userrefs++;
  267                 /* lookup again, see if the VFS was loaded */
  268                 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
  269                         if (!strcmp(vfsp->vfc_name, fstypename))
  270                                 break;
  271                 }
  272                 if (vfsp == NULL) {
  273                         lf->userrefs--;
  274                         linker_file_unload(lf);
  275                         cache_drop(ncp);
  276                         vput(vp);
  277                         return (ENODEV);
  278                 }
  279         }
  280         if ((vp->v_flag & VMOUNT) != 0 ||
  281             vp->v_mountedhere != NULL) {
  282                 cache_drop(ncp);
  283                 vput(vp);
  284                 return (EBUSY);
  285         }
  286         vp->v_flag |= VMOUNT;
  287
  288         /*
  289          * Allocate and initialize the filesystem.
  290          */
  291         mp = malloc(sizeof(struct mount), M_MOUNT, M_ZERO|M_WAITOK);
  292         TAILQ_INIT(&mp->mnt_nvnodelist);
  293         TAILQ_INIT(&mp->mnt_reservedvnlist);
  294         TAILQ_INIT(&mp->mnt_jlist);
  295         mp->mnt_nvnodelistsize = 0;
  296         lockinit(&mp->mnt_lock, 0, "vfslock", 0, LK_NOPAUSE);
  297         vfs_busy(mp, LK_NOWAIT, td);
  298         mp->mnt_op = vfsp->vfc_vfsops;
  299         mp->mnt_vfc = vfsp;
  300         vfsp->vfc_refcount++;
  301         mp->mnt_stat.f_type = vfsp->vfc_typenum;
  302         mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
  303         strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
  304         mp->mnt_vnodecovered = vp;
  305         mp->mnt_stat.f_owner = p->p_ucred->cr_uid;
  306         mp->mnt_iosize_max = DFLTPHYS;
  307         VOP_UNLOCK(vp, 0, td);
  308 update:
              /*
               * Both routes arrive here with mp busied, vp referenced but
               * unlocked, and VMOUNT set on vp.
               */
  309         /*
  310          * Set the mount level flags.
  311          */
  312         if (uap->flags & MNT_RDONLY)
  313                 mp->mnt_flag |= MNT_RDONLY;
  314         else if (mp->mnt_flag & MNT_RDONLY)
  315                 mp->mnt_kern_flag |= MNTK_WANTRDWR;
  316         mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
  317             MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME |
  318             MNT_NOSYMFOLLOW | MNT_IGNORE |
  319             MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
  320         mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC |
  321             MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE |
  322             MNT_NOSYMFOLLOW | MNT_IGNORE |
  323             MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
  324         /*
  325          * Mount the filesystem.
  326          * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
  327          * get.
  328          */
  329         error = VFS_MOUNT(mp, uap->path, uap->data, td);
              /*
               * MNT_UPDATE is only placed in mnt_flag by the update route
               * above, so this test selects the update completion path.
               */
  330         if (mp->mnt_flag & MNT_UPDATE) {
  331                 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
  332                         mp->mnt_flag &= ~MNT_RDONLY;
  333                 mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
  334                 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
                      /* A failed update rolls both flag words back. */
  335                 if (error) {
  336                         mp->mnt_flag = flag;
  337                         mp->mnt_kern_flag = flag2;
  338                 }
  339                 vfs_unbusy(mp, td);
  340                 vp->v_flag &= ~VMOUNT;
  341                 vrele(vp);
  342                 cache_drop(ncp);
  343                 return (error);
  344         }
  345         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  346         /*
  347          * Put the new filesystem on the mount list after root.  The mount
  348          * point gets its own mnt_ncp which is a special ncp linking the
  349          * vnode-under to the root of the new mount.  The lookup code
  350          * detects the mount point going forward and detects the special
  351          * mnt_ncp via NCP_MOUNTPT going backwards.
  352          *
  353          * It is not necessary to invalidate or purge the vnode underneath
  354          * because elements under the mount will be given their own glue
  355          * namecache record.
  356          */
  357         if (!error) {
  358                 nlc.nlc_nameptr = "";
  359                 nlc.nlc_namelen = 0;
  360                 mp->mnt_ncp = cache_nlookup(ncp, &nlc);
  361                 cache_setunresolved(mp->mnt_ncp);
  362                 mp->mnt_ncp->nc_flag |= NCF_MOUNTPT;
  363                 mp->mnt_ncp->nc_mount = mp;
  364                 cache_drop(ncp);
  365                 /* XXX get the root of the fs and cache_setvp(mnt_ncp...) */
  366                 vp->v_flag &= ~VMOUNT;
  367                 vp->v_mountedhere = mp;
  368                 mountlist_insert(mp, MNTINS_LAST);
  369                 checkdirs(vp, mp->mnt_ncp);
  370                 cache_unlock(mp->mnt_ncp);      /* leave ref intact */
  371                 VOP_UNLOCK(vp, 0, td);
                      /*
                       * NOTE(review): the vfs_allocate_syncvnode() result is
                       * overwritten by VFS_START() below without being
                       * examined.
                       */
  372                 error = vfs_allocate_syncvnode(mp);
  373                 vfs_unbusy(mp, td);
                      /*
                       * NOTE(review): when VFS_START() fails only the vnode
                       * reference is dropped; the mount is left on the mount
                       * list and vp->v_mountedhere still points at it.
                       */
  374                 if ((error = VFS_START(mp, 0, td)) != 0)
  375                         vrele(vp);
  376         } else {
                      /* VFS_MOUNT() failed: undo the allocation done above. */
  377                 vfs_rm_vnodeops(&mp->mnt_vn_coherency_ops);
  378                 vfs_rm_vnodeops(&mp->mnt_vn_journal_ops);
  379                 vfs_rm_vnodeops(&mp->mnt_vn_norm_ops);
  380                 vfs_rm_vnodeops(&mp->mnt_vn_spec_ops);
  381                 vfs_rm_vnodeops(&mp->mnt_vn_fifo_ops);
  382                 vp->v_flag &= ~VMOUNT;
  383                 mp->mnt_vfc->vfc_refcount--;
  384                 vfs_unbusy(mp, td);
  385                 free(mp, M_MOUNT);
  386                 cache_drop(ncp);
  387                 vput(vp);
  388         }
  389         return (error);
  390 }
 391
 392 /*
 393  * Scan all active processes to see if any of them have a current
 394  * or root directory onto which the new filesystem has just been
 395  * mounted. If so, replace them with the new mount point.
 396  *
 397  * The passed ncp is ref'd and locked (from the mount code) and
 398  * must be associated with the vnode representing the root of the
 399  * mount point.
 400  */
 401 static void
 402 checkdirs(struct vnode *olddp, struct namecache *ncp)
 403 {
 404         struct filedesc *fdp;
 405         struct vnode *newdp;
 406         struct mount *mp;
 407         struct proc *p;
 408
 409         if (olddp->v_usecount == 1)
 410                 return;
 411         mp = olddp->v_mountedhere;
 412         if (VFS_ROOT(mp, &newdp))
 413                 panic("mount: lost mount");
 414         cache_setvp(ncp, newdp);
 415
 416         if (rootvnode == olddp) {
 417                 vref(newdp);
 418                 vfs_cache_setroot(newdp, cache_hold(ncp));
 419         }
 420
 421         FOREACH_PROC_IN_SYSTEM(p) {
 422                 fdp = p->p_fd;
 423                 if (fdp->fd_cdir == olddp) {
 424                         vrele(fdp->fd_cdir);
 425                         vref(newdp);
 426                         fdp->fd_cdir = newdp;
 427                         cache_drop(fdp->fd_ncdir);
 428                         fdp->fd_ncdir = cache_hold(ncp);
 429                 }
 430                 if (fdp->fd_rdir == olddp) {
 431                         vrele(fdp->fd_rdir);
 432                         vref(newdp);
 433                         fdp->fd_rdir = newdp;
 434                         cache_drop(fdp->fd_nrdir);
 435                         fdp->fd_nrdir = cache_hold(ncp);
 436                 }
 437         }
 438         vput(newdp);
 439 }
 440
 441 /*
 442  * Unmount a file system.
 443  *
 444  * Note: unmount takes a path to the vnode mounted on as argument,
 445  * not special file (as before).
 446  */
 447 /*
 448  * umount_args(char *path, int flags)
 449  */
 450 /* ARGSUSED */
 451 int
 452 unmount(struct unmount_args *uap)
 453 {
 454         struct thread *td = curthread;
 455         struct proc *p = td->td_proc;
 456         struct mount *mp = NULL;
 457         int error;
 458         struct nlookupdata nd;
 459
 460         KKASSERT(p);
 461         if (p->p_ucred->cr_prison != NULL)
 462                 return (EPERM);
 463         if (usermount == 0 && (error = suser(td)))
 464                 return (error);
 465
 466         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
 467         if (error == 0)
 468                 error = nlookup(&nd);
 469         if (error)
 470                 goto out;
 471
 472         mp = nd.nl_ncp->nc_mount;
 473
 474         /*
 475          * Only root, or the user that did the original mount is
 476          * permitted to unmount this filesystem.
 477          */
 478         if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) &&
 479             (error = suser(td)))
 480                 goto out;
 481
 482         /*
 483          * Don't allow unmounting the root file system.
 484          */
 485         if (mp->mnt_flag & MNT_ROOTFS) {
 486                 error = EINVAL;
 487                 goto out;
 488         }
 489
 490         /*
 491          * Must be the root of the filesystem
 492          */
 493         if (! (nd.nl_ncp->nc_flag & NCF_MOUNTPT)) {
 494                 error = EINVAL;
 495                 goto out;
 496         }
 497
 498 out:
 499         nlookup_done(&nd);
 500         if (error)
 501                 return (error);
 502         return (dounmount(mp, uap->flags, td));
 503 }
 504
 505 /*
 506  * Do the actual file system unmount.
 507  */
 508 static int
 509 dounmount_interlock(struct mount *mp)
 510 {
 511         if (mp->mnt_kern_flag & MNTK_UNMOUNT)
 512                 return (EBUSY);
 513         mp->mnt_kern_flag |= MNTK_UNMOUNT;
 514         return(0);
 515 }
  516
      /*
       * Unmount the filesystem described by mp.
       *
       * The mount is first claimed through dounmount_interlock() and its
       * mnt_lock is drained (without waiting unless MNT_FORCE is given).
       * The filesystem is then synced and VFS_UNMOUNT() is called.
       *
       * If the drain or the unmount fails, the unmount flags are cleared
       * again, any MNTK_MWAIT sleepers are woken, and the error is returned
       * with the mount still in place.  On success the mount is taken off
       * the mount list, its vnode ops are removed, the covered vnode is
       * released and mp is freed.
       *
       * flags: MNT_* unmount flags; only MNT_FORCE is examined here, the
       *        rest is passed through to VFS_UNMOUNT().
       *
       * Returns 0 on success, otherwise an errno value.
       */
  517 int
  518 dounmount(struct mount *mp, int flags, struct thread *td)
  519 {
  520         struct vnode *coveredvp;
  521         int error;
  522         int async_flag;
  523
  524         /*
  525          * Exclusive access for unmounting purposes
              *
              * NOTE(review): presumably mountlist_interlock() returns the
              * callback's result (EBUSY when an unmount is already in
              * progress) - confirm against its definition.
  526          */
  527         if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
  528                 return (error);
  529
  530         /*
  531          * Allow filesystems to detect that a forced unmount is in progress.
  532          */
  533         if (flags & MNT_FORCE)
  534                 mp->mnt_kern_flag |= MNTK_UNMOUNTF;
  535         error = lockmgr(&mp->mnt_lock, LK_DRAIN |
  536             ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), NULL, td);
  537         if (error) {
  538                 mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
  539                 if (mp->mnt_kern_flag & MNTK_MWAIT)
  540                         wakeup(mp);
  541                 return (error);
  542         }
  543
  544         if (mp->mnt_flag & MNT_EXPUBLIC)
  545                 vfs_setpublicfs(NULL, NULL, NULL);
  546
  547         vfs_msync(mp, MNT_WAIT);
              /* MNT_ASYNC is masked for the sync and restored on failure. */
  548         async_flag = mp->mnt_flag & MNT_ASYNC;
  549         mp->mnt_flag &=~ MNT_ASYNC;
  550         cache_purgevfs(mp);     /* remove cache entries for this file sys */
  551         if (mp->mnt_syncer != NULL)
  552                 vrele(mp->mnt_syncer);
              /*
               * VFS_UNMOUNT() is attempted when the mount is read-only, when
               * VFS_SYNC() succeeds, or when the unmount is forced.  Otherwise
               * the VFS_SYNC() error falls through to the failure path below.
               */
  553         if (((mp->mnt_flag & MNT_RDONLY) ||
  554              (error = VFS_SYNC(mp, MNT_WAIT, td)) == 0) ||
  555             (flags & MNT_FORCE))
  556                 error = VFS_UNMOUNT(mp, flags, td);
  557         if (error) {
                      /*
                       * NOTE(review): this only re-creates the syncer vnode
                       * if mnt_syncer is NULL; nothing in this function
                       * clears it, so presumably the vrele() above does so
                       * indirectly - confirm.
                       */
  558                 if (mp->mnt_syncer == NULL)
  559                         vfs_allocate_syncvnode(mp);
  560                 mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
  561                 mp->mnt_flag |= async_flag;
  562                 lockmgr(&mp->mnt_lock, LK_RELEASE | LK_REENABLE, NULL, td);
  563                 if (mp->mnt_kern_flag & MNTK_MWAIT)
  564                         wakeup(mp);
  565                 return (error);
  566         }
  567         /*
  568          * Clean up any journals still associated with the mount after
  569          * filesystem activity has ceased.
  570          */
  571         journal_remove_all_journals(mp,
  572             ((flags & MNT_FORCE) ? MC_JOURNAL_STOP_IMM : 0));
  573
  574         mountlist_remove(mp);
  575
  576         /*
  577          * Remove any installed vnode ops here so the individual VFSs don't
  578          * have to.
  579          */
  580         vfs_rm_vnodeops(&mp->mnt_vn_coherency_ops);
  581         vfs_rm_vnodeops(&mp->mnt_vn_journal_ops);
  582         vfs_rm_vnodeops(&mp->mnt_vn_norm_ops);
  583         vfs_rm_vnodeops(&mp->mnt_vn_spec_ops);
  584         vfs_rm_vnodeops(&mp->mnt_vn_fifo_ops);
  585
              /* Detach from the covered vnode and drop the mount's ncp ref. */
  586         if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
  587                 coveredvp->v_mountedhere = NULL;
  588                 vrele(coveredvp);
  589                 cache_drop(mp->mnt_ncp);
  590                 mp->mnt_ncp = NULL;
  591         }
  592         mp->mnt_vfc->vfc_refcount--;
  593         if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
  594                 panic("unmount: dangling vnode");
  595         lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
  596         if (mp->mnt_kern_flag & MNTK_MWAIT)
  597                 wakeup(mp);
  598         free(mp, M_MOUNT);
  599         return (0);
  600 }
 601
 602 /*
 603  * Sync each mounted filesystem.
 604  */
 605
 606 #ifdef DEBUG
 607 static int syncprt = 0;
 608 SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
 609 #endif /* DEBUG */
 610
 611 static int sync_callback(struct mount *mp, void *data);
 612
 613 /* ARGSUSED */
 614 int
 615 sync(struct sync_args *uap)
 616 {
 617         mountlist_scan(sync_callback, NULL, MNTSCAN_FORWARD);
 618 #ifdef DEBUG
 619         /*
 620          * print out buffer pool stat information on each sync() call.
 621          */
 622         if (syncprt)
 623                 vfs_bufstats();
 624 #endif /* DEBUG */
 625         return (0);
 626 }
 627
 628 static
 629 int
 630 sync_callback(struct mount *mp, void *data __unused)
 631 {
 632         int asyncflag;
 633
 634         if ((mp->mnt_flag & MNT_RDONLY) == 0) {
 635                 asyncflag = mp->mnt_flag & MNT_ASYNC;
 636                 mp->mnt_flag &= ~MNT_ASYNC;
 637                 vfs_msync(mp, MNT_NOWAIT);
 638                 VFS_SYNC(mp, MNT_NOWAIT, curthread);
 639                 mp->mnt_flag |= asyncflag;
 640         }
 641         return(0);
 642 }
 643
  644 /* XXX PRISON: could be per prison flag */
      /*
       * While this is zero, quotactl() refuses jailed callers with EPERM.
       * It has no initializer, and the sysctl that would expose it is
       * compiled out below.
       */
  645 static int prison_quotas;
  646 #if 0
  647 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
  648 #endif
 649
 650 /*
 651  *  quotactl_args(char *path, int fcmd, int uid, caddr_t arg)
 652  *
 653  * Change filesystem quotas.
 654  */
 655 /* ARGSUSED */
 656 int
 657 quotactl(struct quotactl_args *uap)
 658 {
 659         struct nlookupdata nd;
 660         struct thread *td;
 661         struct proc *p;
 662         struct mount *mp;
 663         int error;
 664
 665         td = curthread;
 666         p = td->td_proc;
 667         if (p->p_ucred->cr_prison && !prison_quotas)
 668                 return (EPERM);
 669
 670         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
 671         if (error == 0)
 672                 error = nlookup(&nd);
 673         if (error == 0) {
 674                 mp = nd.nl_ncp->nc_mount;
 675                 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid,
 676                                     uap->arg, nd.nl_td);
 677         }
 678         nlookup_done(&nd);
 679         return (error);
 680 }
 681
 682 /*
 683  * mountctl(char *path, int op, int fd, const void *ctl, int ctllen,
 684  *              void *buf, int buflen)
 685  *
 686  * This function operates on a mount point and executes the specified
 687  * operation using the specified control data, and possibly returns data.
 688  *
 689  * The actual number of bytes stored in the result buffer is returned, 0
 690  * if none, otherwise an error is returned.
 691  */
 692 /* ARGSUSED */
 693 int
 694 mountctl(struct mountctl_args *uap)
 695 {
 696         struct thread *td = curthread;
 697         struct proc *p = td->td_proc;
 698         struct filedesc *fdp = p->p_fd;
 699         struct file *fp;
 700         void *ctl = NULL;
 701         void *buf = NULL;
 702         char *path = NULL;
 703         int error;
 704
 705         /*
 706          * Sanity and permissions checks.  We must be root.
 707          */
 708         KKASSERT(p);
 709         if (p->p_ucred->cr_prison != NULL)
 710                 return (EPERM);
 711         if ((error = suser(td)) != 0)
 712                 return (error);
 713
 714         /*
 715          * Argument length checks
 716          */
 717         if (uap->ctllen < 0 || uap->ctllen > 1024)
 718                 return (EINVAL);
 719         if (uap->buflen < 0 || uap->buflen > 16 * 1024)
 720                 return (EINVAL);
 721         if (uap->path == NULL)
 722                 return (EINVAL);
 723
 724         /*
 725          * Allocate the necessary buffers and copyin data
 726          */
 727         path = zalloc(namei_zone);
 728         error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
 729         if (error)
 730                 goto done;
 731
 732         if (uap->ctllen) {
 733                 ctl = malloc(uap->ctllen + 1, M_TEMP, M_WAITOK|M_ZERO);
 734                 error = copyin(uap->ctl, ctl, uap->ctllen);
 735                 if (error)
 736                         goto done;
 737         }
 738         if (uap->buflen)
 739                 buf = malloc(uap->buflen + 1, M_TEMP, M_WAITOK|M_ZERO);
 740
 741         /*
 742          * Validate the descriptor
 743          */
 744         if (uap->fd == -1) {
 745                 fp = NULL;
 746         } else if ((u_int)uap->fd >= fdp->fd_nfiles ||
 747             (fp = fdp->fd_files[uap->fd].fp) == NULL) {
 748                 error = EBADF;
 749                 goto done;
 750         }
 751         if (fp)
 752                 fhold(fp);
 753
 754         /*
 755          * Execute the internal kernel function and clean up.
 756          */
 757         error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen, buf, uap->buflen, &uap->sysmsg_result);
 758         if (fp)
 759                 fdrop(fp, td);
 760         if (error == 0 && uap->sysmsg_result > 0)
 761                 error = copyout(buf, uap->buf, uap->sysmsg_result);
 762 done:
 763         if (path)
 764                 zfree(namei_zone, path);
 765         if (ctl)
 766                 free(ctl, M_TEMP);
 767         if (buf)
 768                 free(buf, M_TEMP);
 769         return (error);
 770 }
 771
 772 /*
 773  * Execute a mount control operation by resolving the path to a mount point
 774  * and calling vop_mountctl().
 775  */
 776 int
 777 kern_mountctl(const char *path, int op, struct file *fp,
 778                 const void *ctl, int ctllen,
 779                 void *buf, int buflen, int *res)
 780 {
 781         struct vnode *vp;
 782         struct mount *mp;
 783         struct nlookupdata nd;
 784         int error;
 785
 786         *res = 0;
 787         vp = NULL;
 788         error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
 789         if (error == 0)
 790                 error = nlookup(&nd);
 791         if (error == 0)
 792                 error = cache_vget(nd.nl_ncp, nd.nl_cred, LK_EXCLUSIVE, &vp);
 793         nlookup_done(&nd);
 794         if (error)
 795                 return (error);
 796
 797         mp = vp->v_mount;
 798
 799         /*
 800          * Must be the root of the filesystem
 801          */
 802         if ((vp->v_flag & VROOT) == 0) {
 803                 vput(vp);
 804                 return (EINVAL);
 805         }
 806         error = vop_mountctl(mp->mnt_vn_use_ops, op, fp, ctl, ctllen,
 807                                 buf, buflen, res);
 808         vput(vp);
 809         return (error);
 810 }
 811
 812 int
 813 kern_statfs(struct nlookupdata *nd, struct statfs *buf)
 814 {
 815         struct thread *td = curthread;
 816         struct proc *p = td->td_proc;
 817         struct mount *mp;
 818         struct statfs *sp;
 819         char *fullpath, *freepath;
 820         int error;
 821
 822         if ((error = nlookup(nd)) != 0)
 823                 return (error);
 824         mp = nd->nl_ncp->nc_mount;
 825         sp = &mp->mnt_stat;
 826         if ((error = VFS_STATFS(mp, sp, td)) != 0)
 827                 return (error);
 828
 829         error = cache_fullpath(p, mp->mnt_ncp, &fullpath, &freepath);
 830         if (error)
 831                 return(error);
 832         bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 833         strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
 834         free(freepath, M_TEMP);
 835
 836         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 837         bcopy(sp, buf, sizeof(*buf));
 838         /* Only root should have access to the fsid's. */
 839         if (suser(td))
 840                 buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
 841         return (0);
 842 }
 843
 844 /*
 845  * statfs_args(char *path, struct statfs *buf)
 846  *
 847  * Get filesystem statistics.
 848  */
 849 int
 850 statfs(struct statfs_args *uap)
 851 {
 852         struct nlookupdata nd;
 853         struct statfs buf;
 854         int error;
 855
 856         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
 857         if (error == 0)
 858                 error = kern_statfs(&nd, &buf);
 859         nlookup_done(&nd);
 860         if (error == 0)
 861                 error = copyout(&buf, uap->buf, sizeof(*uap->buf));
 862         return (error);
 863 }
 864
 865 int
 866 kern_fstatfs(int fd, struct statfs *buf)
 867 {
 868         struct thread *td = curthread;
 869         struct proc *p = td->td_proc;
 870         struct file *fp;
 871         struct mount *mp;
 872         struct statfs *sp;
 873         char *fullpath, *freepath;
 874         int error;
 875
 876         KKASSERT(p);
 877         error = getvnode(p->p_fd, fd, &fp);
 878         if (error)
 879                 return (error);
 880         mp = ((struct vnode *)fp->f_data)->v_mount;
 881         if (mp == NULL)
 882                 return (EBADF);
 883         sp = &mp->mnt_stat;
 884         error = VFS_STATFS(mp, sp, td);
 885         if (error)
 886                 return (error);
 887
 888         error = cache_fullpath(p, mp->mnt_ncp, &fullpath, &freepath);
 889         if (error)
 890                 return(error);
 891         bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 892         strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
 893         free(freepath, M_TEMP);
 894
 895         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 896         bcopy(sp, buf, sizeof(*buf));
 897
 898         /* Only root should have access to the fsid's. */
 899         if (suser(td))
 900                 buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
 901         return (0);
 902 }
 903
 904 /*
 905  * fstatfs_args(int fd, struct statfs *buf)
 906  *
 907  * Get filesystem statistics.
 908  */
 909 int
 910 fstatfs(struct fstatfs_args *uap)
 911 {
 912         struct statfs buf;
 913         int error;
 914
 915         error = kern_fstatfs(uap->fd, &buf);
 916
 917         if (error == 0)
 918                 error = copyout(&buf, uap->buf, sizeof(*uap->buf));
 919         return (error);
 920 }
 921
 922 /*
 923  * getfsstat_args(struct statfs *buf, long bufsize, int flags)
 924  *
 925  * Get statistics on all filesystems.
 926  */
 927
 928 struct getfsstat_info {
 929         struct statfs *sfsp;
 930         long count;
 931         long maxcount;
 932         int error;
 933         int flags;
 934         int is_chrooted;
 935         struct thread *td;
 936         struct proc *p;
 937 };
 938
 939 static int getfsstat_callback(struct mount *, void *);
 940
 941 /* ARGSUSED */
 942 int
 943 getfsstat(struct getfsstat_args *uap)
 944 {
 945         struct thread *td = curthread;
 946         struct proc *p = td->td_proc;
 947         struct getfsstat_info info;
 948
 949         bzero(&info, sizeof(info));
 950         if (p != NULL && (p->p_fd->fd_nrdir->nc_flag & NCF_ROOT) == 0)
 951                 info.is_chrooted = 1;
 952         else
 953                 info.is_chrooted = 0;
 954
 955         info.maxcount = uap->bufsize / sizeof(struct statfs);
 956         info.sfsp = uap->buf;
 957         info.count = 0;
 958         info.flags = uap->flags;
 959         info.td = td;
 960         info.p = p;
 961
 962         mountlist_scan(getfsstat_callback, &info, MNTSCAN_FORWARD);
 963         if (info.sfsp && info.count > info.maxcount)
 964                 uap->sysmsg_result = info.maxcount;
 965         else
 966                 uap->sysmsg_result = info.count;
 967         return (info.error);
 968 }
 969
 970 static int
 971 getfsstat_callback(struct mount *mp, void *data)
 972 {
 973         struct getfsstat_info *info = data;
 974         struct statfs *sp;
 975         char *freepath;
 976         char *fullpath;
 977         int error;
 978
 979         if (info->sfsp && info->count < info->maxcount) {
 980                 if (info->is_chrooted && !chroot_visible_mnt(mp, info->p))
 981                         return(0);
 982                 sp = &mp->mnt_stat;
 983
 984                 /*
 985                  * If MNT_NOWAIT or MNT_LAZY is specified, do not
 986                  * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
 987                  * overrides MNT_WAIT.
 988                  */
 989                 if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
 990                     (info->flags & MNT_WAIT)) &&
 991                     (error = VFS_STATFS(mp, sp, info->td))) {
 992                         return(0);
 993                 }
 994                 sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 995
 996                 error = cache_fullpath(info->p, mp->mnt_ncp,
 997                                         &fullpath, &freepath);
 998                 if (error) {
 999                         info->error = error;
1000                         return(-1);
1001                 }
1002                 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1003                 strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1004                 free(freepath, M_TEMP);
1005
1006                 error = copyout(sp, info->sfsp, sizeof(*sp));
1007                 if (error) {
1008                         info->error = error;
1009                         return (-1);
1010                 }
1011                 ++info->sfsp;
1012         }
1013         info->count++;
1014         return(0);
1015 }
1016
1017 /*
1018  * fchdir_args(int fd)
1019  *
1020  * Change current working directory to a given file descriptor.
1021  */
1022 /* ARGSUSED */
1023 int
1024 fchdir(struct fchdir_args *uap)
1025 {
1026         struct thread *td = curthread;
1027         struct proc *p = td->td_proc;
1028         struct filedesc *fdp = p->p_fd;
1029         struct vnode *vp, *ovp;
1030         struct mount *mp;
1031         struct file *fp;
1032         struct namecache *ncp, *oncp;
1033         struct namecache *nct;
1034         int error;
1035
1036         if ((error = getvnode(fdp, uap->fd, &fp)) != 0)
1037                 return (error);
1038         vp = (struct vnode *)fp->f_data;
1039         vref(vp);
1040         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1041         if (vp->v_type != VDIR || fp->f_ncp == NULL)
1042                 error = ENOTDIR;
1043         else
1044                 error = VOP_ACCESS(vp, VEXEC, p->p_ucred, td);
1045         if (error) {
1046                 vput(vp);
1047                 return (error);
1048         }
1049         ncp = cache_hold(fp->f_ncp);
1050         while (!error && (mp = vp->v_mountedhere) != NULL) {
1051                 error = nlookup_mp(mp, &nct);
1052                 if (error == 0) {
1053                         cache_unlock(nct);      /* leave ref intact */
1054                         vput(vp);
1055                         vp = nct->nc_vp;
1056                         error = vget(vp, LK_SHARED, td);
1057                         KKASSERT(error == 0);
1058                         cache_drop(ncp);
1059                         ncp = nct;
1060                 }
1061         }
1062         if (error == 0) {
1063                 ovp = fdp->fd_cdir;
1064                 oncp = fdp->fd_ncdir;
1065                 VOP_UNLOCK(vp, 0, td);  /* leave ref intact */
1066                 fdp->fd_cdir = vp;
1067                 fdp->fd_ncdir = ncp;
1068                 cache_drop(oncp);
1069                 vrele(ovp);
1070         } else {
1071                 cache_drop(ncp);
1072                 vput(vp);
1073         }
1074         return (error);
1075 }
1076
1077 int
1078 kern_chdir(struct nlookupdata *nd)
1079 {
1080         struct thread *td = curthread;
1081         struct proc *p = td->td_proc;
1082         struct filedesc *fdp = p->p_fd;
1083         struct vnode *vp, *ovp;
1084         struct namecache *oncp;
1085         int error;
1086
1087         if ((error = nlookup(nd)) != 0)
1088                 return (error);
1089         if ((vp = nd->nl_ncp->nc_vp) == NULL)
1090                 return (ENOENT);
1091         if ((error = vget(vp, LK_SHARED, td)) != 0)
1092                 return (error);
1093
1094         error = checkvp_chdir(vp, td);
1095         VOP_UNLOCK(vp, 0, td);
1096         if (error == 0) {
1097                 ovp = fdp->fd_cdir;
1098                 oncp = fdp->fd_ncdir;
1099                 cache_unlock(nd->nl_ncp);       /* leave reference intact */
1100                 fdp->fd_ncdir = nd->nl_ncp;
1101                 fdp->fd_cdir = vp;
1102                 cache_drop(oncp);
1103                 vrele(ovp);
1104                 nd->nl_ncp = NULL;
1105         } else {
1106                 vrele(vp);
1107         }
1108         return (error);
1109 }
1110
1111 /*
1112  * chdir_args(char *path)
1113  *
1114  * Change current working directory (``.'').
1115  */
1116 int
1117 chdir(struct chdir_args *uap)
1118 {
1119         struct nlookupdata nd;
1120         int error;
1121
1122         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1123         if (error == 0)
1124                 error = kern_chdir(&nd);
1125         nlookup_done(&nd);
1126         return (error);
1127 }
1128
1129 /*
1130  * Helper function for raised chroot(2) security function:  Refuse if
1131  * any filedescriptors are open directories.
1132  */
1133 static int
1134 chroot_refuse_vdir_fds(fdp)
1135         struct filedesc *fdp;
1136 {
1137         struct vnode *vp;
1138         struct file *fp;
1139         int error;
1140         int fd;
1141
1142         for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
1143                 error = getvnode(fdp, fd, &fp);
1144                 if (error)
1145                         continue;
1146                 vp = (struct vnode *)fp->f_data;
1147                 if (vp->v_type != VDIR)
1148                         continue;
1149                 return(EPERM);
1150         }
1151         return (0);
1152 }
1153
1154 /*
1155  * This sysctl determines if we will allow a process to chroot(2) if it
1156  * has a directory open:
1157  *      0: disallowed for all processes.
1158  *      1: allowed for processes that were not already chroot(2)'ed.
1159  *      2: allowed for all processes.
1160  */
1161
1162 static int chroot_allow_open_directories = 1;
1163
1164 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
1165      &chroot_allow_open_directories, 0, "");
1166
1167 /*
1168  * chroot to the specified namecache entry.  We obtain the vp from the
1169  * namecache data.  The passed ncp must be locked and referenced and will
1170  * remain locked and referenced on return.
1171  */
1172 int
1173 kern_chroot(struct namecache *ncp)
1174 {
1175         struct thread *td = curthread;
1176         struct proc *p = td->td_proc;
1177         struct filedesc *fdp = p->p_fd;
1178         struct vnode *vp;
1179         int error;
1180
1181         /*
1182          * Only root can chroot
1183          */
1184         if ((error = suser_cred(p->p_ucred, PRISON_ROOT)) != 0)
1185                 return (error);
1186
1187         /*
1188          * Disallow open directory descriptors (fchdir() breakouts).
1189          */
1190         if (chroot_allow_open_directories == 0 ||
1191            (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
1192                 if ((error = chroot_refuse_vdir_fds(fdp)) != 0)
1193                         return (error);
1194         }
1195         if ((vp = ncp->nc_vp) == NULL)
1196                 return (ENOENT);
1197
1198         if ((error = vget(vp, LK_SHARED, td)) != 0)
1199                 return (error);
1200
1201         /*
1202          * Check the validity of vp as a directory to change to and
1203          * associate it with rdir/jdir.
1204          */
1205         error = checkvp_chdir(vp, td);
1206         VOP_UNLOCK(vp, 0, td);  /* leave reference intact */
1207         if (error == 0) {
1208                 vrele(fdp->fd_rdir);
1209                 fdp->fd_rdir = vp;      /* reference inherited by fd_rdir */
1210                 cache_drop(fdp->fd_nrdir);
1211                 fdp->fd_nrdir = cache_hold(ncp);
1212                 if (fdp->fd_jdir == NULL) {
1213                         fdp->fd_jdir = vp;
1214                         vref(fdp->fd_jdir);
1215                         fdp->fd_njdir = cache_hold(ncp);
1216                 }
1217         } else {
1218                 vrele(vp);
1219         }
1220         return (error);
1221 }
1222
1223 /*
1224  * chroot_args(char *path)
1225  *
1226  * Change notion of root (``/'') directory.
1227  */
1228 /* ARGSUSED */
1229 int
1230 chroot(struct chroot_args *uap)
1231 {
1232         struct thread *td = curthread;
1233         struct nlookupdata nd;
1234         int error;
1235
1236         KKASSERT(td->td_proc);
1237         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1238         if (error) {
1239                 nlookup_done(&nd);
1240                 return(error);
1241         }
1242         error = nlookup(&nd);
1243         if (error == 0)
1244                 error = kern_chroot(nd.nl_ncp);
1245         nlookup_done(&nd);
1246         return(error);
1247 }
1248
1249 /*
1250  * Common routine for chroot and chdir.  Given a locked, referenced vnode,
1251  * determine whether it is legal to chdir to the vnode.  The vnode's state
1252  * is not changed by this call.
1253  */
1254 int
1255 checkvp_chdir(struct vnode *vp, struct thread *td)
1256 {
1257         int error;
1258
1259         if (vp->v_type != VDIR)
1260                 error = ENOTDIR;
1261         else
1262                 error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred, td);
1263         return (error);
1264 }
1265
1266 int
1267 kern_open(struct nlookupdata *nd, int oflags, int mode, int *res)
1268 {
1269         struct thread *td = curthread;
1270         struct proc *p = td->td_proc;
1271         struct lwp *lp = td->td_lwp;
1272         struct filedesc *fdp = p->p_fd;
1273         int cmode, flags;
1274         struct file *nfp;
1275         struct file *fp;
1276         struct vnode *vp;
1277         int type, indx, error;
1278         struct flock lf;
1279
1280         if ((oflags & O_ACCMODE) == O_ACCMODE)
1281                 return (EINVAL);
1282         flags = FFLAGS(oflags);
1283         error = falloc(p, &nfp, NULL);
1284         if (error)
1285                 return (error);
1286         fp = nfp;
1287         cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
1288
1289         /*
1290          * XXX p_dupfd is a real mess.  It allows a device to return a
1291          * file descriptor to be duplicated rather then doing the open
1292          * itself.
1293          */
1294         lp->lwp_dupfd = -1;
1295
1296         /*
1297          * Call vn_open() to do the lookup and assign the vnode to the
1298          * file pointer.  vn_open() does not change the ref count on fp
1299          * and the vnode, on success, will be inherited by the file pointer
1300          * and unlocked.
1301          */
1302         nd->nl_flags |= NLC_LOCKVP;
1303         error = vn_open(nd, fp, flags, cmode);
1304         nlookup_done(nd);
1305         if (error) {
1306                 /*
1307                  * handle special fdopen() case.  bleh.  dupfdopen() is
1308                  * responsible for dropping the old contents of ofiles[indx]
1309                  * if it succeeds.
1310                  *
1311                  * Note that if fsetfd() succeeds it will add a ref to fp
1312                  * which represents the fd_files[] assignment.  We must still
1313                  * drop our reference.
1314                  */
1315                 if ((error == ENODEV || error == ENXIO) && lp->lwp_dupfd >= 0) {
1316                         if (fsetfd(p, fp, &indx) == 0) {
1317                                 error = dupfdopen(fdp, indx, lp->lwp_dupfd, flags, error);
1318                                 if (error == 0) {
1319                                         *res = indx;
1320                                         fdrop(fp, td);  /* our ref */
1321                                         return (0);
1322                                 }
1323                                 if (fdp->fd_files[indx].fp == fp) {
1324                                         funsetfd(fdp, indx);
1325                                         fdrop(fp, td);  /* fd_files[] ref */
1326                                 }
1327                         }
1328                 }
1329                 fdrop(fp, td);  /* our ref */
1330                 if (error == ERESTART)
1331                         error = EINTR;
1332                 return (error);
1333         }
1334
1335         /*
1336          * ref the vnode for ourselves so it can't be ripped out from under
1337          * is.  XXX need an ND flag to request that the vnode be returned
1338          * anyway.
1339          */
1340         vp = (struct vnode *)fp->f_data;
1341         vref(vp);
1342         if ((error = fsetfd(p, fp, &indx)) != 0) {
1343                 fdrop(fp, td);
1344                 vrele(vp);
1345                 return (error);
1346         }
1347
1348         /*
1349          * If no error occurs the vp will have been assigned to the file
1350          * pointer.
1351          */
1352         lp->lwp_dupfd = 0;
1353
1354         /*
1355          * There should be 2 references on the file, one from the descriptor
1356          * table, and one for us.
1357          *
1358          * Handle the case where someone closed the file (via its file
1359          * descriptor) while we were blocked.  The end result should look
1360          * like opening the file succeeded but it was immediately closed.
1361          */
1362         if (fp->f_count == 1) {
1363                 KASSERT(fdp->fd_files[indx].fp != fp,
1364                     ("Open file descriptor lost all refs"));
1365                 vrele(vp);
1366                 fo_close(fp, td);
1367                 fdrop(fp, td);
1368                 *res = indx;
1369                 return 0;
1370         }
1371
1372         if (flags & (O_EXLOCK | O_SHLOCK)) {
1373                 lf.l_whence = SEEK_SET;
1374                 lf.l_start = 0;
1375                 lf.l_len = 0;
1376                 if (flags & O_EXLOCK)
1377                         lf.l_type = F_WRLCK;
1378                 else
1379                         lf.l_type = F_RDLCK;
1380                 type = F_FLOCK;
1381                 if ((flags & FNONBLOCK) == 0)
1382                         type |= F_WAIT;
1383
1384                 if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
1385                         /*
1386                          * lock request failed.  Normally close the descriptor
1387                          * but handle the case where someone might have dup()d
1388                          * it when we weren't looking.  One reference is
1389                          * owned by the descriptor array, the other by us.
1390                          */
1391                         vrele(vp);
1392                         if (fdp->fd_files[indx].fp == fp) {
1393                                 funsetfd(fdp, indx);
1394                                 fdrop(fp, td);
1395                         }
1396                         fdrop(fp, td);
1397                         return (error);
1398                 }
1399                 fp->f_flag |= FHASLOCK;
1400         }
1401         /* assert that vn_open created a backing object if one is needed */
1402         KASSERT(!vn_canvmio(vp) || VOP_GETVOBJECT(vp, NULL) == 0,
1403                 ("open: vmio vnode has no backing object after vn_open"));
1404
1405         vrele(vp);
1406
1407         /*
1408          * release our private reference, leaving the one associated with the
1409          * descriptor table intact.
1410          */
1411         fdrop(fp, td);
1412         *res = indx;
1413         return (0);
1414 }
1415
1416 /*
1417  * open_args(char *path, int flags, int mode)
1418  *
1419  * Check permissions, allocate an open file structure,
1420  * and call the device open routine if any.
1421  */
1422 int
1423 open(struct open_args *uap)
1424 {
1425         struct nlookupdata nd;
1426         int error;
1427
1428         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1429         if (error == 0) {
1430                 error = kern_open(&nd, uap->flags,
1431                                     uap->mode, &uap->sysmsg_result);
1432         }
1433         nlookup_done(&nd);
1434         return (error);
1435 }
1436
1437 int
1438 kern_mknod(struct nlookupdata *nd, int mode, int dev)
1439 {
1440         struct namecache *ncp;
1441         struct thread *td = curthread;
1442         struct proc *p = td->td_proc;
1443         struct vnode *vp;
1444         struct vattr vattr;
1445         int error;
1446         int whiteout = 0;
1447
1448         KKASSERT(p);
1449
1450         switch (mode & S_IFMT) {
1451         case S_IFCHR:
1452         case S_IFBLK:
1453                 error = suser(td);
1454                 break;
1455         default:
1456                 error = suser_cred(p->p_ucred, PRISON_ROOT);
1457                 break;
1458         }
1459         if (error)
1460                 return (error);
1461
1462         bwillwrite();
1463         nd->nl_flags |= NLC_CREATE;
1464         if ((error = nlookup(nd)) != 0)
1465                 return (error);
1466         ncp = nd->nl_ncp;
1467         if (ncp->nc_vp)
1468                 return (EEXIST);
1469
1470         VATTR_NULL(&vattr);
1471         vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
1472         vattr.va_rdev = dev;
1473         whiteout = 0;
1474
1475         switch (mode & S_IFMT) {
1476         case S_IFMT:    /* used by badsect to flag bad sectors */
1477                 vattr.va_type = VBAD;
1478                 break;
1479         case S_IFCHR:
1480                 vattr.va_type = VCHR;
1481                 break;
1482         case S_IFBLK:
1483                 vattr.va_type = VBLK;
1484                 break;
1485         case S_IFWHT:
1486                 whiteout = 1;
1487                 break;
1488         default:
1489                 error = EINVAL;
1490                 break;
1491         }
1492         if (error == 0) {
1493                 if (whiteout) {
1494                         error = VOP_NWHITEOUT(ncp, nd->nl_cred, NAMEI_CREATE);
1495                 } else {
1496                         vp = NULL;
1497                         error = VOP_NMKNOD(ncp, &vp, nd->nl_cred, &vattr);
1498                         if (error == 0)
1499                                 vput(vp);
1500                 }
1501         }
1502         return (error);
1503 }
1504
1505 /*
1506  * mknod_args(char *path, int mode, int dev)
1507  *
1508  * Create a special file.
1509  */
1510 int
1511 mknod(struct mknod_args *uap)
1512 {
1513         struct nlookupdata nd;
1514         int error;
1515
1516         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1517         if (error == 0)
1518                 error = kern_mknod(&nd, uap->mode, uap->dev);
1519         nlookup_done(&nd);
1520         return (error);
1521 }
1522
1523 int
1524 kern_mkfifo(struct nlookupdata *nd, int mode)
1525 {
1526         struct namecache *ncp;
1527         struct thread *td = curthread;
1528         struct proc *p = td->td_proc;
1529         struct vattr vattr;
1530         struct vnode *vp;
1531         int error;
1532
1533         bwillwrite();
1534
1535         nd->nl_flags |= NLC_CREATE;
1536         if ((error = nlookup(nd)) != 0)
1537                 return (error);
1538         ncp = nd->nl_ncp;
1539         if (ncp->nc_vp)
1540                 return (EEXIST);
1541
1542         VATTR_NULL(&vattr);
1543         vattr.va_type = VFIFO;
1544         vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
1545         vp = NULL;
1546         error = VOP_NMKNOD(ncp, &vp, nd->nl_cred, &vattr);
1547         if (error == 0)
1548                 vput(vp);
1549         return (error);
1550 }
1551
1552 /*
1553  * mkfifo_args(char *path, int mode)
1554  *
1555  * Create a named pipe.
1556  */
1557 int
1558 mkfifo(struct mkfifo_args *uap)
1559 {
1560         struct nlookupdata nd;
1561         int error;
1562
1563         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1564         if (error == 0)
1565                 error = kern_mkfifo(&nd, uap->mode);
1566         nlookup_done(&nd);
1567         return (error);
1568 }
1569
1570 static int hardlink_check_uid = 0;
1571 SYSCTL_INT(_kern, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1572     &hardlink_check_uid, 0,
1573     "Unprivileged processes cannot create hard links to files owned by other "
1574     "users");
1575 static int hardlink_check_gid = 0;
1576 SYSCTL_INT(_kern, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1577     &hardlink_check_gid, 0,
1578     "Unprivileged processes cannot create hard links to files owned by other "
1579     "groups");
1580
1581 static int
1582 can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
1583 {
1584         struct vattr va;
1585         int error;
1586
1587         /*
1588          * Shortcut if disabled
1589          */
1590         if (hardlink_check_uid == 0 && hardlink_check_gid == 0)
1591                 return (0);
1592
1593         /*
1594          * root cred can always hardlink
1595          */
1596         if (suser_cred(cred, PRISON_ROOT) == 0)
1597                 return (0);
1598
1599         /*
1600          * Otherwise only if the originating file is owned by the
1601          * same user or group.  Note that any group is allowed if
1602          * the file is owned by the caller.
1603          */
1604         error = VOP_GETATTR(vp, &va, td);
1605         if (error != 0)
1606                 return (error);
1607
1608         if (hardlink_check_uid) {
1609                 if (cred->cr_uid != va.va_uid)
1610                         return (EPERM);
1611         }
1612
1613         if (hardlink_check_gid) {
1614                 if (cred->cr_uid != va.va_uid && !groupmember(va.va_gid, cred))
1615                         return (EPERM);
1616         }
1617
1618         return (0);
1619 }
1620
1621 int
1622 kern_link(struct nlookupdata *nd, struct nlookupdata *linknd)
1623 {
1624         struct thread *td = curthread;
1625         struct vnode *vp;
1626         int error;
1627
1628         /*
1629          * Lookup the source and obtained a locked vnode.
1630          *
1631          * XXX relookup on vget failure / race ?
1632          */
1633         bwillwrite();
1634         if ((error = nlookup(nd)) != 0)
1635                 return (error);
1636         vp = nd->nl_ncp->nc_vp;
1637         KKASSERT(vp != NULL);
1638         if (vp->v_type == VDIR)
1639                 return (EPERM);         /* POSIX */
1640         if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0)
1641                 return (error);
1642
1643         /*
1644          * Unlock the source so we can lookup the target without deadlocking
1645          * (XXX vp is locked already, possible other deadlock?).  The target
1646          * must not exist.
1647          */
1648         KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
1649         nd->nl_flags &= ~NLC_NCPISLOCKED;
1650         cache_unlock(nd->nl_ncp);
1651
1652         linknd->nl_flags |= NLC_CREATE;
1653         if ((error = nlookup(linknd)) != 0) {
1654                 vput(vp);
1655                 return (error);
1656         }
1657         if (linknd->nl_ncp->nc_vp) {
1658                 vput(vp);
1659                 return (EEXIST);
1660         }
1661
1662         /*
1663          * Finally run the new API VOP.
1664          */
1665         error = can_hardlink(vp, td, td->td_proc->p_ucred);
1666         if (error == 0)
1667                 error = VOP_NLINK(linknd->nl_ncp, vp, linknd->nl_cred);
1668         vput(vp);
1669         return (error);
1670 }
1671
1672 /*
1673  * link_args(char *path, char *link)
1674  *
1675  * Make a hard file link.
1676  */
1677 int
1678 link(struct link_args *uap)
1679 {
1680         struct nlookupdata nd, linknd;
1681         int error;
1682
1683         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1684         if (error == 0) {
1685                 error = nlookup_init(&linknd, uap->link, UIO_USERSPACE, 0);
1686                 if (error == 0)
1687                         error = kern_link(&nd, &linknd);
1688                 nlookup_done(&linknd);
1689         }
1690         nlookup_done(&nd);
1691         return (error);
1692 }
1693
1694 int
1695 kern_symlink(struct nlookupdata *nd, char *path, int mode)
1696 {
1697         struct namecache *ncp;
1698         struct vattr vattr;
1699         struct vnode *vp;
1700         int error;
1701
1702         bwillwrite();
1703         nd->nl_flags |= NLC_CREATE;
1704         if ((error = nlookup(nd)) != 0)
1705                 return (error);
1706         ncp = nd->nl_ncp;
1707         if (ncp->nc_vp)
1708                 return (EEXIST);
1709
1710         VATTR_NULL(&vattr);
1711         vattr.va_mode = mode;
1712         error = VOP_NSYMLINK(ncp, &vp, nd->nl_cred, &vattr, path);
1713         if (error == 0)
1714                 vput(vp);
1715         return (error);
1716 }
1717
1718 /*
1719  * symlink(char *path, char *link)
1720  *
1721  * Make a symbolic link.
1722  */
1723 int
1724 symlink(struct symlink_args *uap)
1725 {
1726         struct thread *td = curthread;
1727         struct nlookupdata nd;
1728         char *path;
1729         int error;
1730         int mode;
1731
1732         path = zalloc(namei_zone);
1733         error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
1734         if (error == 0) {
1735                 error = nlookup_init(&nd, uap->link, UIO_USERSPACE, 0);
1736                 if (error == 0) {
1737                         mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
1738                         error = kern_symlink(&nd, path, mode);
1739                 }
1740                 nlookup_done(&nd);
1741         }
1742         zfree(namei_zone, path);
1743         return (error);
1744 }
1745
1746 /*
1747  * undelete_args(char *path)
1748  *
1749  * Delete a whiteout from the filesystem.
1750  */
1751 /* ARGSUSED */
1752 int
1753 undelete(struct undelete_args *uap)
1754 {
1755         struct nlookupdata nd;
1756         int error;
1757
1758         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1759         bwillwrite();
1760         nd.nl_flags |= NLC_DELETE;
1761         if (error == 0)
1762                 error = nlookup(&nd);
1763         if (error == 0)
1764                 error = VOP_NWHITEOUT(nd.nl_ncp, nd.nl_cred, NAMEI_DELETE);
1765         nlookup_done(&nd);
1766         return (error);
1767 }
1768
1769 int
1770 kern_unlink(struct nlookupdata *nd)
1771 {
1772         struct namecache *ncp;
1773         int error;
1774
1775         bwillwrite();
1776         nd->nl_flags |= NLC_DELETE;
1777         if ((error = nlookup(nd)) != 0)
1778                 return (error);
1779         ncp = nd->nl_ncp;
1780         error = VOP_NREMOVE(ncp, nd->nl_cred);
1781         return (error);
1782 }
1783
1784 /*
1785  * unlink_args(char *path)
1786  *
1787  * Delete a name from the filesystem.
1788  */
1789 int
1790 unlink(struct unlink_args *uap)
1791 {
1792         struct nlookupdata nd;
1793         int error;
1794
1795         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1796         if (error == 0)
1797                 error = kern_unlink(&nd);
1798         nlookup_done(&nd);
1799         return (error);
1800 }
1801
1802 int
1803 kern_lseek(int fd, off_t offset, int whence, off_t *res)
1804 {
1805         struct thread *td = curthread;
1806         struct proc *p = td->td_proc;
1807         struct filedesc *fdp = p->p_fd;
1808         struct file *fp;
1809         struct vattr vattr;
1810         int error;
1811
1812         if ((u_int)fd >= fdp->fd_nfiles ||
1813             (fp = fdp->fd_files[fd].fp) == NULL)
1814                 return (EBADF);
1815         if (fp->f_type != DTYPE_VNODE)
1816                 return (ESPIPE);
1817         switch (whence) {
1818         case L_INCR:
1819                 fp->f_offset += offset;
1820                 break;
1821         case L_XTND:
1822                 error=VOP_GETATTR((struct vnode *)fp->f_data, &vattr, td);
1823                 if (error)
1824                         return (error);
1825                 fp->f_offset = offset + vattr.va_size;
1826                 break;
1827         case L_SET:
1828                 fp->f_offset = offset;
1829                 break;
1830         default:
1831                 return (EINVAL);
1832         }
1833         *res = fp->f_offset;
1834         return (0);
1835 }
1836
1837 /*
1838  * lseek_args(int fd, int pad, off_t offset, int whence)
1839  *
1840  * Reposition read/write file offset.
1841  */
1842 int
1843 lseek(struct lseek_args *uap)
1844 {
1845         int error;
1846
1847         error = kern_lseek(uap->fd, uap->offset, uap->whence,
1848             &uap->sysmsg_offset);
1849
1850         return (error);
1851 }
1852
1853 int
1854 kern_access(struct nlookupdata *nd, int aflags)
1855 {
1856         struct thread *td = curthread;
1857         struct vnode *vp;
1858         int error, flags;
1859
1860         if ((error = nlookup(nd)) != 0)
1861                 return (error);
1862 retry:
1863         error = cache_vget(nd->nl_ncp, nd->nl_cred, LK_EXCLUSIVE, &vp);
1864         if (error)
1865                 return (error);
1866
1867         /* Flags == 0 means only check for existence. */
1868         if (aflags) {
1869                 flags = 0;
1870                 if (aflags & R_OK)
1871                         flags |= VREAD;
1872                 if (aflags & W_OK)
1873                         flags |= VWRITE;
1874                 if (aflags & X_OK)
1875                         flags |= VEXEC;
1876                 if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
1877                         error = VOP_ACCESS(vp, flags, nd->nl_cred, td);
1878
1879                 /*
1880                  * If the file handle is stale we have to re-resolve the
1881                  * entry.  This is a hack at the moment.
1882                  */
1883                 if (error == ESTALE) {
1884                         cache_setunresolved(nd->nl_ncp);
1885                         error = cache_resolve(nd->nl_ncp, nd->nl_cred);
1886                         if (error == 0) {
1887                                 vput(vp);
1888                                 vp = NULL;
1889                                 goto retry;
1890                         }
1891                 }
1892         }
1893         vput(vp);
1894         return (error);
1895 }
1896
1897 /*
1898  * access_args(char *path, int flags)
1899  *
1900  * Check access permissions.
1901  */
1902 int
1903 access(struct access_args *uap)
1904 {
1905         struct nlookupdata nd;
1906         int error;
1907
1908         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1909         if (error == 0)
1910                 error = kern_access(&nd, uap->flags);
1911         nlookup_done(&nd);
1912         return (error);
1913 }
1914
1915 int
1916 kern_stat(struct nlookupdata *nd, struct stat *st)
1917 {
1918         int error;
1919         struct vnode *vp;
1920         thread_t td;
1921
1922         if ((error = nlookup(nd)) != 0)
1923                 return (error);
1924 again:
1925         if ((vp = nd->nl_ncp->nc_vp) == NULL)
1926                 return (ENOENT);
1927
1928         td = curthread;
1929         if ((error = vget(vp, LK_SHARED, td)) != 0)
1930                 return (error);
1931         error = vn_stat(vp, st, td);
1932
1933         /*
1934          * If the file handle is stale we have to re-resolve the entry.  This
1935          * is a hack at the moment.
1936          */
1937         if (error == ESTALE) {
1938                 cache_setunresolved(nd->nl_ncp);
1939                 error = cache_resolve(nd->nl_ncp, nd->nl_cred);
1940                 if (error == 0) {
1941                         vput(vp);
1942                         goto again;
1943                 }
1944         }
1945         vput(vp);
1946         return (error);
1947 }
1948
1949 /*
1950  * stat_args(char *path, struct stat *ub)
1951  *
1952  * Get file status; this version follows links.
1953  */
1954 int
1955 stat(struct stat_args *uap)
1956 {
1957         struct nlookupdata nd;
1958         struct stat st;
1959         int error;
1960
1961         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1962         if (error == 0) {
1963                 error = kern_stat(&nd, &st);
1964                 if (error == 0)
1965                         error = copyout(&st, uap->ub, sizeof(*uap->ub));
1966         }
1967         nlookup_done(&nd);
1968         return (error);
1969 }
1970
1971 /*
1972  * lstat_args(char *path, struct stat *ub)
1973  *
1974  * Get file status; this version does not follow links.
1975  */
1976 int
1977 lstat(struct lstat_args *uap)
1978 {
1979         struct nlookupdata nd;
1980         struct stat st;
1981         int error;
1982
1983         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1984         if (error == 0) {
1985                 error = kern_stat(&nd, &st);
1986                 if (error == 0)
1987                         error = copyout(&st, uap->ub, sizeof(*uap->ub));
1988         }
1989         nlookup_done(&nd);
1990         return (error);
1991 }
1992
1993 /*
1994  * pathconf_Args(char *path, int name)
1995  *
1996  * Get configurable pathname variables.
1997  */
1998 /* ARGSUSED */
1999 int
2000 pathconf(struct pathconf_args *uap)
2001 {
2002         struct nlookupdata nd;
2003         struct vnode *vp;
2004         int error;
2005
2006         vp = NULL;
2007         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2008         if (error == 0)
2009                 error = nlookup(&nd);
2010         if (error == 0)
2011                 error = cache_vget(nd.nl_ncp, nd.nl_cred, LK_EXCLUSIVE, &vp);
2012         nlookup_done(&nd);
2013         if (error == 0) {
2014                 error = VOP_PATHCONF(vp, uap->name, uap->sysmsg_fds);
2015                 vput(vp);
2016         }
2017         return (error);
2018 }
2019
2020 /*
2021  * XXX: daver
2022  * kern_readlink isn't properly split yet.  There is a copyin burried
2023  * in VOP_READLINK().
2024  */
2025 int
2026 kern_readlink(struct nlookupdata *nd, char *buf, int count, int *res)
2027 {
2028         struct thread *td = curthread;
2029         struct proc *p = td->td_proc;
2030         struct vnode *vp;
2031         struct iovec aiov;
2032         struct uio auio;
2033         int error;
2034
2035         if ((error = nlookup(nd)) != 0)
2036                 return (error);
2037         error = cache_vget(nd->nl_ncp, nd->nl_cred, LK_EXCLUSIVE, &vp);
2038         if (error)
2039                 return (error);
2040         if (vp->v_type != VLNK) {
2041                 error = EINVAL;
2042         } else {
2043                 aiov.iov_base = buf;
2044                 aiov.iov_len = count;
2045                 auio.uio_iov = &aiov;
2046                 auio.uio_iovcnt = 1;
2047                 auio.uio_offset = 0;
2048                 auio.uio_rw = UIO_READ;
2049                 auio.uio_segflg = UIO_USERSPACE;
2050                 auio.uio_td = td;
2051                 auio.uio_resid = count;
2052                 error = VOP_READLINK(vp, &auio, p->p_ucred);
2053         }
2054         vput(vp);
2055         *res = count - auio.uio_resid;
2056         return (error);
2057 }
2058
2059 /*
2060  * readlink_args(char *path, char *buf, int count)
2061  *
2062  * Return target name of a symbolic link.
2063  */
2064 int
2065 readlink(struct readlink_args *uap)
2066 {
2067         struct nlookupdata nd;
2068         int error;
2069
2070         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2071         if (error == 0) {
2072                 error = kern_readlink(&nd, uap->buf, uap->count,
2073                                         &uap->sysmsg_result);
2074         }
2075         nlookup_done(&nd);
2076         return (error);
2077 }
2078
2079 static int
2080 setfflags(struct vnode *vp, int flags)
2081 {
2082         struct thread *td = curthread;
2083         struct proc *p = td->td_proc;
2084         int error;
2085         struct vattr vattr;
2086
2087         /*
2088          * Prevent non-root users from setting flags on devices.  When
2089          * a device is reused, users can retain ownership of the device
2090          * if they are allowed to set flags and programs assume that
2091          * chown can't fail when done as root.
2092          */
2093         if ((vp->v_type == VCHR || vp->v_type == VBLK) &&
2094             ((error = suser_cred(p->p_ucred, PRISON_ROOT)) != 0))
2095                 return (error);
2096
2097         /*
2098          * note: vget is required for any operation that might mod the vnode
2099          * so VINACTIVE is properly cleared.
2100          */
2101         VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE);
2102         if ((error = vget(vp, LK_EXCLUSIVE, td)) == 0) {
2103                 VATTR_NULL(&vattr);
2104                 vattr.va_flags = flags;
2105                 error = VOP_SETATTR(vp, &vattr, p->p_ucred, td);
2106                 vput(vp);
2107         }
2108         return (error);
2109 }
2110
2111 /*
2112  * chflags(char *path, int flags)
2113  *
2114  * Change flags of a file given a path name.
2115  */
2116 /* ARGSUSED */
2117 int
2118 chflags(struct chflags_args *uap)
2119 {
2120         struct nlookupdata nd;
2121         struct vnode *vp;
2122         int error;
2123
2124         vp = NULL;
2125         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2126         /* XXX Add NLC flag indicating modifying operation? */
2127         if (error == 0)
2128                 error = nlookup(&nd);
2129         if (error == 0)
2130                 error = cache_vref(nd.nl_ncp, nd.nl_cred, &vp);
2131         nlookup_done(&nd);
2132         if (error == 0) {
2133                 error = setfflags(vp, uap->flags);
2134                 vrele(vp);
2135         }
2136         return (error);
2137 }
2138
2139 /*
2140  * fchflags_args(int fd, int flags)
2141  *
2142  * Change flags of a file given a file descriptor.
2143  */
2144 /* ARGSUSED */
2145 int
2146 fchflags(struct fchflags_args *uap)
2147 {
2148         struct thread *td = curthread;
2149         struct proc *p = td->td_proc;
2150         struct file *fp;
2151         int error;
2152
2153         if ((error = getvnode(p->p_fd, uap->fd, &fp)) != 0)
2154                 return (error);
2155         return setfflags((struct vnode *) fp->f_data, uap->flags);
2156 }
2157
2158 static int
2159 setfmode(struct vnode *vp, int mode)
2160 {
2161         struct thread *td = curthread;
2162         struct proc *p = td->td_proc;
2163         int error;
2164         struct vattr vattr;
2165
2166         /*
2167          * note: vget is required for any operation that might mod the vnode
2168          * so VINACTIVE is properly cleared.
2169          */
2170         VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE);
2171         if ((error = vget(vp, LK_EXCLUSIVE, td)) == 0) {
2172                 VATTR_NULL(&vattr);
2173                 vattr.va_mode = mode & ALLPERMS;
2174                 error = VOP_SETATTR(vp, &vattr, p->p_ucred, td);
2175                 vput(vp);
2176         }
2177         return error;
2178 }
2179
2180 int
2181 kern_chmod(struct nlookupdata *nd, int mode)
2182 {
2183         struct vnode *vp;
2184         int error;
2185
2186         /* XXX Add NLC flag indicating modifying operation? */
2187         if ((error = nlookup(nd)) != 0)
2188                 return (error);
2189         if ((error = cache_vref(nd->nl_ncp, nd->nl_cred, &vp)) != 0)
2190                 return (error);
2191         error = setfmode(vp, mode);
2192         vrele(vp);
2193         return (error);
2194 }
2195
2196 /*
2197  * chmod_args(char *path, int mode)
2198  *
2199  * Change mode of a file given path name.
2200  */
2201 /* ARGSUSED */
2202 int
2203 chmod(struct chmod_args *uap)
2204 {
2205         struct nlookupdata nd;
2206         int error;
2207
2208         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2209         if (error == 0)
2210                 error = kern_chmod(&nd, uap->mode);
2211         nlookup_done(&nd);
2212         return (error);
2213 }
2214
2215 /*
2216  * lchmod_args(char *path, int mode)
2217  *
2218  * Change mode of a file given path name (don't follow links.)
2219  */
2220 /* ARGSUSED */
2221 int
2222 lchmod(struct lchmod_args *uap)
2223 {
2224         struct nlookupdata nd;
2225         int error;
2226
2227         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2228         if (error == 0)
2229                 error = kern_chmod(&nd, uap->mode);
2230         nlookup_done(&nd);
2231         return (error);
2232 }
2233
2234 /*
2235  * fchmod_args(int fd, int mode)
2236  *
2237  * Change mode of a file given a file descriptor.
2238  */
2239 /* ARGSUSED */
2240 int
2241 fchmod(struct fchmod_args *uap)
2242 {
2243         struct thread *td = curthread;
2244         struct proc *p = td->td_proc;
2245         struct file *fp;
2246         int error;
2247
2248         if ((error = getvnode(p->p_fd, uap->fd, &fp)) != 0)
2249                 return (error);
2250         return setfmode((struct vnode *)fp->f_data, uap->mode);
2251 }
2252
2253 static int
2254 setfown(struct vnode *vp, uid_t uid, gid_t gid)
2255 {
2256         struct thread *td = curthread;
2257         struct proc *p = td->td_proc;
2258         int error;
2259         struct vattr vattr;
2260
2261         /*
2262          * note: vget is required for any operation that might mod the vnode
2263          * so VINACTIVE is properly cleared.
2264          */
2265         VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE);
2266         if ((error = vget(vp, LK_EXCLUSIVE, td)) == 0) {
2267                 VATTR_NULL(&vattr);
2268                 vattr.va_uid = uid;
2269                 vattr.va_gid = gid;
2270                 error = VOP_SETATTR(vp, &vattr, p->p_ucred, td);
2271                 vput(vp);
2272         }
2273         return error;
2274 }
2275
2276 int
2277 kern_chown(struct nlookupdata *nd, int uid, int gid)
2278 {
2279         struct vnode *vp;
2280         int error;
2281
2282         /* XXX Add NLC flag indicating modifying operation? */
2283         if ((error = nlookup(nd)) != 0)
2284                 return (error);
2285         if ((error = cache_vref(nd->nl_ncp, nd->nl_cred, &vp)) != 0)
2286                 return (error);
2287         error = setfown(vp, uid, gid);
2288         vrele(vp);
2289         return (error);
2290 }
2291
2292 /*
2293  * chown(char *path, int uid, int gid)
2294  *
2295  * Set ownership given a path name.
2296  */
2297 int
2298 chown(struct chown_args *uap)
2299 {
2300         struct nlookupdata nd;
2301         int error;
2302
2303         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2304         if (error == 0)
2305                 error = kern_chown(&nd, uap->uid, uap->gid);
2306         nlookup_done(&nd);
2307         return (error);
2308 }
2309
2310 /*
2311  * lchown_args(char *path, int uid, int gid)
2312  *
2313  * Set ownership given a path name, do not cross symlinks.
2314  */
2315 int
2316 lchown(struct lchown_args *uap)
2317 {
2318         struct nlookupdata nd;
2319         int error;
2320
2321         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2322         if (error == 0)
2323                 error = kern_chown(&nd, uap->uid, uap->gid);
2324         nlookup_done(&nd);
2325         return (error);
2326 }
2327
2328 /*
2329  * fchown_args(int fd, int uid, int gid)
2330  *
2331  * Set ownership given a file descriptor.
2332  */
2333 /* ARGSUSED */
2334 int
2335 fchown(struct fchown_args *uap)
2336 {
2337         struct thread *td = curthread;
2338         struct proc *p = td->td_proc;
2339         struct file *fp;
2340         int error;
2341
2342         if ((error = getvnode(p->p_fd, uap->fd, &fp)) != 0)
2343                 return (error);
2344         return setfown((struct vnode *)fp->f_data,
2345                 uap->uid, uap->gid);
2346 }
2347
2348 static int
2349 getutimes(const struct timeval *tvp, struct timespec *tsp)
2350 {
2351         struct timeval tv[2];
2352
2353         if (tvp == NULL) {
2354                 microtime(&tv[0]);
2355                 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
2356                 tsp[1] = tsp[0];
2357         } else {
2358                 TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
2359                 TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
2360         }
2361         return 0;
2362 }
2363
2364 static int
2365 setutimes(struct vnode *vp, const struct timespec *ts, int nullflag)
2366 {
2367         struct thread *td = curthread;
2368         struct proc *p = td->td_proc;
2369         int error;
2370         struct vattr vattr;
2371
2372         /*
2373          * note: vget is required for any operation that might mod the vnode
2374          * so VINACTIVE is properly cleared.
2375          */
2376         VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE);
2377         if ((error = vget(vp, LK_EXCLUSIVE, td)) == 0) {
2378                 VATTR_NULL(&vattr);
2379                 vattr.va_atime = ts[0];
2380                 vattr.va_mtime = ts[1];
2381                 if (nullflag)
2382                         vattr.va_vaflags |= VA_UTIMES_NULL;
2383                 error = VOP_SETATTR(vp, &vattr, p->p_ucred, td);
2384                 vput(vp);
2385         }
2386         return error;
2387 }
2388
2389 int
2390 kern_utimes(struct nlookupdata *nd, struct timeval *tptr)
2391 {
2392         struct timespec ts[2];
2393         struct vnode *vp;
2394         int error;
2395
2396         if ((error = getutimes(tptr, ts)) != 0)
2397                 return (error);
2398         /* XXX Add NLC flag indicating modifying operation? */
2399         if ((error = nlookup(nd)) != 0)
2400                 return (error);
2401         if ((error = cache_vref(nd->nl_ncp, nd->nl_cred, &vp)) != 0)
2402                 return (error);
2403         error = setutimes(vp, ts, tptr == NULL);
2404         vrele(vp);
2405         return (error);
2406 }
2407
2408 /*
2409  * utimes_args(char *path, struct timeval *tptr)
2410  *
2411  * Set the access and modification times of a file.
2412  */
2413 int
2414 utimes(struct utimes_args *uap)
2415 {
2416         struct timeval tv[2];
2417         struct nlookupdata nd;
2418         int error;
2419
2420         if (uap->tptr) {
2421                 error = copyin(uap->tptr, tv, sizeof(tv));
2422                 if (error)
2423                         return (error);
2424         }
2425         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2426         if (error == 0)
2427                 error = kern_utimes(&nd, uap->tptr ? tv : NULL);
2428         nlookup_done(&nd);
2429         return (error);
2430 }
2431
2432 /*
2433  * lutimes_args(char *path, struct timeval *tptr)
2434  *
2435  * Set the access and modification times of a file.
2436  */
2437 int
2438 lutimes(struct lutimes_args *uap)
2439 {
2440         struct timeval tv[2];
2441         struct nlookupdata nd;
2442         int error;
2443
2444         if (uap->tptr) {
2445                 error = copyin(uap->tptr, tv, sizeof(tv));
2446                 if (error)
2447                         return (error);
2448         }
2449         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2450         if (error == 0)
2451                 error = kern_utimes(&nd, uap->tptr ? tv : NULL);
2452         nlookup_done(&nd);
2453         return (error);
2454 }
2455
2456 int
2457 kern_futimes(int fd, struct timeval *tptr)
2458 {
2459         struct thread *td = curthread;
2460         struct proc *p = td->td_proc;
2461         struct timespec ts[2];
2462         struct file *fp;
2463         int error;
2464
2465         error = getutimes(tptr, ts);
2466         if (error)
2467                 return (error);
2468         error = getvnode(p->p_fd, fd, &fp);
2469         if (error)
2470                 return (error);
2471         error =  setutimes((struct vnode *)fp->f_data, ts, tptr == NULL);
2472         return (error);
2473 }
2474
2475 /*
2476  * futimes_args(int fd, struct timeval *tptr)
2477  *
2478  * Set the access and modification times of a file.
2479  */
2480 int
2481 futimes(struct futimes_args *uap)
2482 {
2483         struct timeval tv[2];
2484         int error;
2485
2486         if (uap->tptr) {
2487                 error = copyin(uap->tptr, tv, sizeof(tv));
2488                 if (error)
2489                         return (error);
2490         }
2491
2492         error = kern_futimes(uap->fd, uap->tptr ? tv : NULL);
2493
2494         return (error);
2495 }
2496
2497 int
2498 kern_truncate(struct nlookupdata *nd, off_t length)
2499 {
2500         struct vnode *vp;
2501         struct vattr vattr;
2502         int error;
2503
2504         if (length < 0)
2505                 return(EINVAL);
2506         /* XXX Add NLC flag indicating modifying operation? */
2507         if ((error = nlookup(nd)) != 0)
2508                 return (error);
2509         if ((error = cache_vref(nd->nl_ncp, nd->nl_cred, &vp)) != 0)
2510                 return (error);
2511         VOP_LEASE(vp, nd->nl_td, nd->nl_cred, LEASE_WRITE);
2512         if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, nd->nl_td)) != 0) {
2513                 vrele(vp);
2514                 return (error);
2515         }
2516         if (vp->v_type == VDIR) {
2517                 error = EISDIR;
2518         } else if ((error = vn_writechk(vp)) == 0 &&
2519             (error = VOP_ACCESS(vp, VWRITE, nd->nl_cred, nd->nl_td)) == 0) {
2520                 VATTR_NULL(&vattr);
2521                 vattr.va_size = length;
2522                 error = VOP_SETATTR(vp, &vattr, nd->nl_cred, nd->nl_td);
2523         }
2524         vput(vp);
2525         return (error);
2526 }
2527
2528 /*
2529  * truncate(char *path, int pad, off_t length)
2530  *
2531  * Truncate a file given its path name.
2532  */
2533 int
2534 truncate(struct truncate_args *uap)
2535 {
2536         struct nlookupdata nd;
2537         int error;
2538
2539         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2540         if (error == 0)
2541                 error = kern_truncate(&nd, uap->length);
2542         nlookup_done(&nd);
2543         return error;
2544 }
2545
2546 int
2547 kern_ftruncate(int fd, off_t length)
2548 {
2549         struct thread *td = curthread;
2550         struct proc *p = td->td_proc;
2551         struct vattr vattr;
2552         struct vnode *vp;
2553         struct file *fp;
2554         int error;
2555
2556         if (length < 0)
2557                 return(EINVAL);
2558         if ((error = getvnode(p->p_fd, fd, &fp)) != 0)
2559                 return (error);
2560         if ((fp->f_flag & FWRITE) == 0)
2561                 return (EINVAL);
2562         vp = (struct vnode *)fp->f_data;
2563         VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE);
2564         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
2565         if (vp->v_type == VDIR)
2566                 error = EISDIR;
2567         else if ((error = vn_writechk(vp)) == 0) {
2568                 VATTR_NULL(&vattr);
2569                 vattr.va_size = length;
2570                 error = VOP_SETATTR(vp, &vattr, fp->f_cred, td);
2571         }
2572         VOP_UNLOCK(vp, 0, td);
2573         return (error);
2574 }
2575
2576 /*
2577  * ftruncate_args(int fd, int pad, off_t length)
2578  *
2579  * Truncate a file given a file descriptor.
2580  */
2581 int
2582 ftruncate(struct ftruncate_args *uap)
2583 {
2584         int error;
2585
2586         error = kern_ftruncate(uap->fd, uap->length);
2587
2588         return (error);
2589 }
2590
2591 /*
2592  * fsync(int fd)
2593  *
2594  * Sync an open file.
2595  */
2596 /* ARGSUSED */
2597 int
2598 fsync(struct fsync_args *uap)
2599 {
2600         struct thread *td = curthread;
2601         struct proc *p = td->td_proc;
2602         struct vnode *vp;
2603         struct file *fp;
2604         vm_object_t obj;
2605         int error;
2606
2607         if ((error = getvnode(p->p_fd, uap->fd, &fp)) != 0)
2608                 return (error);
2609         vp = (struct vnode *)fp->f_data;
2610         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
2611         if (VOP_GETVOBJECT(vp, &obj) == 0)
2612                 vm_object_page_clean(obj, 0, 0, 0);
2613         if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) == 0 &&
2614             vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP) &&
2615             bioops.io_fsync)
2616                 error = (*bioops.io_fsync)(vp);
2617         VOP_UNLOCK(vp, 0, td);
2618         return (error);
2619 }
2620
2621 int
2622 kern_rename(struct nlookupdata *fromnd, struct nlookupdata *tond)
2623 {
2624         struct namecache *fncpd;
2625         struct namecache *tncpd;
2626         struct namecache *ncp;
2627         struct mount *mp;
2628         int error;
2629
2630         bwillwrite();
2631         if ((error = nlookup(fromnd)) != 0)
2632                 return (error);
2633         if ((fncpd = fromnd->nl_ncp->nc_parent) == NULL)
2634                 return (ENOENT);
2635         cache_hold(fncpd);
2636
2637         /*
2638          * unlock the source ncp so we can lookup the target ncp without
2639          * deadlocking.  The target may or may not exist so we do not check
2640          * for a target vp like kern_mkdir() and other creation functions do.
2641          *
2642          * The source and target directories are ref'd and rechecked after
2643          * everything is relocked to determine if the source or target file
2644          * has been renamed.
2645          */
2646         KKASSERT(fromnd->nl_flags & NLC_NCPISLOCKED);
2647         fromnd->nl_flags &= ~NLC_NCPISLOCKED;
2648         cache_unlock(fromnd->nl_ncp);
2649
2650         tond->nl_flags |= NLC_CREATE;
2651         if ((error = nlookup(tond)) != 0) {
2652                 cache_drop(fncpd);
2653                 return (error);
2654         }
2655         if ((tncpd = tond->nl_ncp->nc_parent) == NULL) {
2656                 cache_drop(fncpd);
2657                 return (ENOENT);
2658         }
2659         cache_hold(tncpd);
2660
2661         /*
2662          * If the source and target are the same there is nothing to do
2663          */
2664         if (fromnd->nl_ncp == tond->nl_ncp) {
2665                 cache_drop(fncpd);
2666                 cache_drop(tncpd);
2667                 return (0);
2668         }
2669
2670         /*
2671          * relock the source ncp.  NOTE AFTER RELOCKING: the source ncp
2672          * may have become invalid while it was unlocked, nc_vp and nc_mount
2673          * could be NULL.
2674          */
2675         if (cache_lock_nonblock(fromnd->nl_ncp) == 0) {
2676                 cache_resolve(fromnd->nl_ncp, fromnd->nl_cred);
2677         } else if (fromnd->nl_ncp > tond->nl_ncp) {
2678                 cache_lock(fromnd->nl_ncp);
2679                 cache_resolve(fromnd->nl_ncp, fromnd->nl_cred);
2680         } else {
2681                 cache_unlock(tond->nl_ncp);
2682                 cache_lock(fromnd->nl_ncp);
2683                 cache_resolve(fromnd->nl_ncp, fromnd->nl_cred);
2684                 cache_lock(tond->nl_ncp);
2685                 cache_resolve(tond->nl_ncp, tond->nl_cred);
2686         }
2687         fromnd->nl_flags |= NLC_NCPISLOCKED;
2688
2689         /*
2690          * make sure the parent directories linkages are the same
2691          */
2692         if (fncpd != fromnd->nl_ncp->nc_parent ||
2693             tncpd != tond->nl_ncp->nc_parent) {
2694                 cache_drop(fncpd);
2695                 cache_drop(tncpd);
2696                 return (ENOENT);
2697         }
2698
2699         /*
2700          * Both the source and target must be within the same filesystem and
2701          * in the same filesystem as their parent directories within the
2702          * namecache topology.
2703          *
2704          * NOTE: fromnd's nc_mount or nc_vp could be NULL.
2705          */
2706         mp = fncpd->nc_mount;
2707         if (mp != tncpd->nc_mount || mp != fromnd->nl_ncp->nc_mount ||
2708             mp != tond->nl_ncp->nc_mount) {
2709                 cache_drop(fncpd);
2710                 cache_drop(tncpd);
2711                 return (EXDEV);
2712         }
2713
2714         /*
2715          * If the target exists and either the source or target is a directory,
2716          * then both must be directories.
2717          *
2718          * Due to relocking of the source, fromnd->nl_ncp->nc_vp might have
2719          * become NULL.
2720          */
2721         if (tond->nl_ncp->nc_vp) {
2722                 if (fromnd->nl_ncp->nc_vp == NULL) {
2723                         error = ENOENT;
2724                 } else if (fromnd->nl_ncp->nc_vp->v_type == VDIR) {
2725                         if (tond->nl_ncp->nc_vp->v_type != VDIR)
2726                                 error = ENOTDIR;
2727                 } else if (tond->nl_ncp->nc_vp->v_type == VDIR) {
2728                         error = EISDIR;
2729                 }
2730         }
2731
2732         /*
2733          * You cannot rename a source into itself or a subdirectory of itself.
2734          * We check this by travsersing the target directory upwards looking
2735          * for a match against the source.
2736          */
2737         if (error == 0) {
2738                 for (ncp = tncpd; ncp; ncp = ncp->nc_parent) {
2739                         if (fromnd->nl_ncp == ncp) {
2740                                 error = EINVAL;
2741                                 break;
2742                         }
2743                 }
2744         }
2745
2746         cache_drop(fncpd);
2747         cache_drop(tncpd);
2748
2749         /*
2750          * Even though the namespaces are different, they may still represent
2751          * hardlinks to the same file.  The filesystem might have a hard time
2752          * with this so we issue a NREMOVE of the source instead of a NRENAME
2753          * when we detect the situation.
2754          */
2755         if (error == 0) {
2756                 if (fromnd->nl_ncp->nc_vp == tond->nl_ncp->nc_vp) {
2757                         error = VOP_NREMOVE(fromnd->nl_ncp, fromnd->nl_cred);
2758                 } else {
2759                         error = VOP_NRENAME(fromnd->nl_ncp, tond->nl_ncp,
2760                                             tond->nl_cred);
2761                 }
2762         }
2763         return (error);
2764 }
2765
2766 /*
2767  * rename_args(char *from, char *to)
2768  *
2769  * Rename files.  Source and destination must either both be directories,
2770  * or both not be directories.  If target is a directory, it must be empty.
2771  */
2772 int
2773 rename(struct rename_args *uap)
2774 {
2775         struct nlookupdata fromnd, tond;
2776         int error;
2777
2778         error = nlookup_init(&fromnd, uap->from, UIO_USERSPACE, 0);
2779         if (error == 0) {
2780                 error = nlookup_init(&tond, uap->to, UIO_USERSPACE, 0);
2781                 if (error == 0)
2782                         error = kern_rename(&fromnd, &tond);
2783                 nlookup_done(&tond);
2784         }
2785         nlookup_done(&fromnd);
2786         return (error);
2787 }
2788
2789 int
2790 kern_mkdir(struct nlookupdata *nd, int mode)
2791 {
2792         struct thread *td = curthread;
2793         struct proc *p = td->td_proc;
2794         struct namecache *ncp;
2795         struct vnode *vp;
2796         struct vattr vattr;
2797         int error;
2798
2799         bwillwrite();
2800         nd->nl_flags |= NLC_WILLBEDIR | NLC_CREATE;
2801         if ((error = nlookup(nd)) != 0)
2802                 return (error);
2803
2804         ncp = nd->nl_ncp;
2805         if (ncp->nc_vp)
2806                 return (EEXIST);
2807
2808         VATTR_NULL(&vattr);
2809         vattr.va_type = VDIR;
2810         vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;
2811
2812         vp = NULL;
2813         error = VOP_NMKDIR(ncp, &vp, p->p_ucred, &vattr);
2814         if (error == 0)
2815                 vput(vp);
2816         return (error);
2817 }
2818
2819 /*
2820  * mkdir_args(char *path, int mode)
2821  *
2822  * Make a directory file.
2823  */
2824 /* ARGSUSED */
2825 int
2826 mkdir(struct mkdir_args *uap)
2827 {
2828         struct nlookupdata nd;
2829         int error;
2830
2831         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2832         if (error == 0)
2833                 error = kern_mkdir(&nd, uap->mode);
2834         nlookup_done(&nd);
2835         return (error);
2836 }
2837
2838 int
2839 kern_rmdir(struct nlookupdata *nd)
2840 {
2841         struct namecache *ncp;
2842         int error;
2843
2844         bwillwrite();
2845         nd->nl_flags |= NLC_DELETE;
2846         if ((error = nlookup(nd)) != 0)
2847                 return (error);
2848
2849         ncp = nd->nl_ncp;
2850         error = VOP_NRMDIR(ncp, nd->nl_cred);
2851         return (error);
2852 }
2853
2854 /*
2855  * rmdir_args(char *path)
2856  *
2857  * Remove a directory file.
2858  */
2859 /* ARGSUSED */
2860 int
2861 rmdir(struct rmdir_args *uap)
2862 {
2863         struct nlookupdata nd;
2864         int error;
2865
2866         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2867         if (error == 0)
2868                 error = kern_rmdir(&nd);
2869         nlookup_done(&nd);
2870         return (error);
2871 }
2872
2873 int
2874 kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res,
2875     enum uio_seg direction)
2876 {
2877         struct thread *td = curthread;
2878         struct proc *p = td->td_proc;
2879         struct vnode *vp;
2880         struct file *fp;
2881         struct uio auio;
2882         struct iovec aiov;
2883         long loff;
2884         int error, eofflag;
2885
2886         if ((error = getvnode(p->p_fd, fd, &fp)) != 0)
2887                 return (error);
2888         if ((fp->f_flag & FREAD) == 0)
2889                 return (EBADF);
2890         vp = (struct vnode *)fp->f_data;
2891 unionread:
2892         if (vp->v_type != VDIR)
2893                 return (EINVAL);
2894         aiov.iov_base = buf;
2895         aiov.iov_len = count;
2896         auio.uio_iov = &aiov;
2897         auio.uio_iovcnt = 1;
2898         auio.uio_rw = UIO_READ;
2899         auio.uio_segflg = direction;
2900         auio.uio_td = td;
2901         auio.uio_resid = count;
2902         /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */
2903         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
2904         loff = auio.uio_offset = fp->f_offset;
2905         error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
2906         fp->f_offset = auio.uio_offset;
2907         VOP_UNLOCK(vp, 0, td);
2908         if (error)
2909                 return (error);
2910         if (count == auio.uio_resid) {
2911                 if (union_dircheckp) {
2912                         error = union_dircheckp(td, &vp, fp);
2913                         if (error == -1)
2914                                 goto unionread;
2915                         if (error)
2916                                 return (error);
2917                 }
2918                 if ((vp->v_flag & VROOT) &&
2919                     (vp->v_mount->mnt_flag & MNT_UNION)) {
2920                         struct vnode *tvp = vp;
2921                         vp = vp->v_mount->mnt_vnodecovered;
2922                         vref(vp);
2923                         fp->f_data = vp;
2924                         fp->f_offset = 0;
2925                         vrele(tvp);
2926                         goto unionread;
2927                 }
2928         }
2929         if (basep) {
2930                 *basep = loff;
2931         }
2932         *res = count - auio.uio_resid;
2933         return (error);
2934 }
2935
2936 /*
2937  * getdirentries_args(int fd, char *buf, u_int conut, long *basep)
2938  *
2939  * Read a block of directory entries in a file system independent format.
2940  */
2941 int
2942 getdirentries(struct getdirentries_args *uap)
2943 {
2944         long base;
2945         int error;
2946
2947         error = kern_getdirentries(uap->fd, uap->buf, uap->count, &base,
2948             &uap->sysmsg_result, UIO_USERSPACE);
2949
2950         if (error == 0)
2951                 error = copyout(&base, uap->basep, sizeof(*uap->basep));
2952         return (error);
2953 }
2954
2955 /*
2956  * getdents_args(int fd, char *buf, size_t count)
2957  */
2958 int
2959 getdents(struct getdents_args *uap)
2960 {
2961         int error;
2962
2963         error = kern_getdirentries(uap->fd, uap->buf, uap->count, NULL,
2964             &uap->sysmsg_result, UIO_USERSPACE);
2965
2966         return (error);
2967 }
2968
2969 /*
2970  * umask(int newmask)
2971  *
2972  * Set the mode mask for creation of filesystem nodes.
2973  *
2974  * MP SAFE
2975  */
2976 int
2977 umask(struct umask_args *uap)
2978 {
2979         struct thread *td = curthread;
2980         struct proc *p = td->td_proc;
2981         struct filedesc *fdp;
2982
2983         fdp = p->p_fd;
2984         uap->sysmsg_result = fdp->fd_cmask;
2985         fdp->fd_cmask = uap->newmask & ALLPERMS;
2986         return (0);
2987 }
2988
2989 /*
2990  * revoke(char *path)
2991  *
2992  * Void all references to file by ripping underlying filesystem
2993  * away from vnode.
2994  */
2995 /* ARGSUSED */
2996 int
2997 revoke(struct revoke_args *uap)
2998 {
2999         struct thread *td = curthread;
3000         struct nlookupdata nd;
3001         struct vattr vattr;
3002         struct vnode *vp;
3003         struct ucred *cred;
3004         int error;
3005
3006         vp = NULL;
3007         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3008         if (error == 0)
3009                 error = nlookup(&nd);
3010         if (error == 0)
3011                 error = cache_vref(nd.nl_ncp, nd.nl_cred, &vp);
3012         cred = crhold(nd.nl_cred);
3013         nlookup_done(&nd);
3014         if (error == 0) {
3015                 if (vp->v_type != VCHR && vp->v_type != VBLK)
3016                         error = EINVAL;
3017                 if (error == 0)
3018                         error = VOP_GETATTR(vp, &vattr, td);
3019                 if (error == 0 && cred->cr_uid != vattr.va_uid)
3020                         error = suser_cred(cred, PRISON_ROOT);
3021                 if (error == 0 && count_udev(vp->v_udev) > 0) {
3022                         if ((error = vx_lock(vp)) == 0) {
3023                                 VOP_REVOKE(vp, REVOKEALL);
3024                                 vx_unlock(vp);
3025                         }
3026                 }
3027                 vrele(vp);
3028         }
3029         if (cred)
3030                 crfree(cred);
3031         return (error);
3032 }
3033
3034 /*
3035  * Convert a user file descriptor to a kernel file entry.
3036  */
3037 int
3038 getvnode(struct filedesc *fdp, int fd, struct file **fpp)
3039 {
3040         struct file *fp;
3041
3042         if ((u_int)fd >= fdp->fd_nfiles ||
3043             (fp = fdp->fd_files[fd].fp) == NULL)
3044                 return (EBADF);
3045         if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO)
3046                 return (EINVAL);
3047         *fpp = fp;
3048         return (0);
3049 }
3050 /*
3051  * getfh_args(char *fname, fhandle_t *fhp)
3052  *
3053  * Get (NFS) file handle
3054  */
3055 int
3056 getfh(struct getfh_args *uap)
3057 {
3058         struct thread *td = curthread;
3059         struct nlookupdata nd;
3060         fhandle_t fh;
3061         struct vnode *vp;
3062         int error;
3063
3064         /*
3065          * Must be super user
3066          */
3067         if ((error = suser(td)) != 0)
3068                 return (error);
3069
3070         vp = NULL;
3071         error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
3072         if (error == 0)
3073                 error = nlookup(&nd);
3074         if (error == 0)
3075                 error = cache_vget(nd.nl_ncp, nd.nl_cred, LK_EXCLUSIVE, &vp);
3076         nlookup_done(&nd);
3077         if (error == 0) {
3078                 bzero(&fh, sizeof(fh));
3079                 fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
3080                 error = VFS_VPTOFH(vp, &fh.fh_fid);
3081                 vput(vp);
3082                 if (error == 0)
3083                         error = copyout(&fh, uap->fhp, sizeof(fh));
3084         }
3085         return (error);
3086 }
3087
3088 /*
3089  * fhopen_args(const struct fhandle *u_fhp, int flags)
3090  *
3091  * syscall for the rpc.lockd to use to translate a NFS file handle into
3092  * an open descriptor.
3093  *
3094  * warning: do not remove the suser() call or this becomes one giant
3095  * security hole.
      *
      * Outline of the steps below: copy in the handle, find its mount with
      * vfs_getvfs(), obtain the vnode with VFS_FHTOVP(), run the type and
      * access checks, optionally truncate, allocate a struct file, VOP_OPEN
      * the vnode, install the file with fsetfd() and, for O_EXLOCK/O_SHLOCK,
      * take an advisory lock.
      *
      * On success 0 is returned and the new descriptor index is stored in
      * uap->sysmsg_result.  Errors generated directly here: EINVAL (neither
      * FREAD nor FWRITE requested, or O_CREAT given), ESTALE (no mount for
      * the handle's fsid), EMLINK (symlink), EOPNOTSUPP (socket) and EISDIR
      * (write or truncate requested on a directory).  Errors from suser(),
      * copyin() and the VFS/VOP calls are passed through unchanged.
3096  */
3097 int
3098 fhopen(struct fhopen_args *uap)
3099 {
3100         struct thread *td = curthread;
3101         struct proc *p = td->td_proc;
3102         struct mount *mp;
3103         struct vnode *vp;
3104         struct fhandle fhp;
3105         struct vattr vat;
3106         struct vattr *vap = &vat;
3107         struct flock lf;
3108         struct filedesc *fdp = p->p_fd;
3109         int fmode, mode, error, type;
3110         struct file *nfp;
3111         struct file *fp;
3112         int indx;
3113
3114         /*
3115          * Must be super user
3116          */
3117         error = suser(td);
3118         if (error)
3119                 return (error);
3120
3121         fmode = FFLAGS(uap->flags);
3122         /* why not allow a non-read/write open for our lockd? */
3123         if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
3124                 return (EINVAL);
3125         error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
3126         if (error)
3127                 return(error);
3128         /* find the mount point */
3129         mp = vfs_getvfs(&fhp.fh_fsid);
3130         if (mp == NULL)
3131                 return (ESTALE);
3132         /* now give me my vnode, it gets returned to me locked */
3133         error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp);
3134         if (error)
3135                 return (error);
3136         /*
3137          * from now on we have to make sure not
3138          * to forget about the vnode
3139          * any error that causes an abort must vput(vp)
3140          * just set error = err and 'goto bad;'.
3141          */
3142
3143         /*
3144          * from vn_open
3145          */
3146         if (vp->v_type == VLNK) {
3147                 error = EMLINK;
3148                 goto bad;
3149         }
3150         if (vp->v_type == VSOCK) {
3151                 error = EOPNOTSUPP;
3152                 goto bad;
3153         }
             /*
              * Build the VREAD/VWRITE mask handed to VOP_ACCESS() below.
              * Write or truncate access is refused on directories and is
              * additionally screened by vn_writechk().
              */
3154         mode = 0;
3155         if (fmode & (FWRITE | O_TRUNC)) {
3156                 if (vp->v_type == VDIR) {
3157                         error = EISDIR;
3158                         goto bad;
3159                 }
3160                 error = vn_writechk(vp);
3161                 if (error)
3162                         goto bad;
3163                 mode |= VWRITE;
3164         }
3165         if (fmode & FREAD)
3166                 mode |= VREAD;
3167         if (mode) {
3168                 error = VOP_ACCESS(vp, mode, p->p_ucred, td);
3169                 if (error)
3170                         goto bad;
3171         }
             /*
              * O_TRUNC: set the size to 0 through VOP_SETATTR().  The vnode
              * lock is released around VOP_LEASE() and re-acquired before
              * the setattr, so vp is locked again on every exit from this
              * block.
              */
3172         if (fmode & O_TRUNC) {
3173                 VOP_UNLOCK(vp, 0, td);                  /* XXX */
3174                 VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE);
3175                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);       /* XXX */
3176                 VATTR_NULL(vap);
3177                 vap->va_size = 0;
3178                 error = VOP_SETATTR(vp, vap, p->p_ucred, td);
3179                 if (error)
3180                         goto bad;
3181         }
3182
3183         /*
3184          * VOP_OPEN needs the file pointer so it can potentially override
3185          * it.
3186          *
3187          * WARNING! no f_ncp will be associated when fhopen()ing a directory.
3188          * XXX
3189          */
3190         if ((error = falloc(p, &nfp, NULL)) != 0)
3191                 goto bad;
3192         fp = nfp;
3193
3194         fp->f_type = DTYPE_VNODE;
3195         fp->f_flag = fmode & FMASK;
3196         fp->f_ops = &vnode_fileops;
3197         fp->f_data = vp;
3198
3199         error = VOP_OPEN(vp, fmode, p->p_ucred, fp, td);
3200         if (error) {
3201                 /*
3202                  * setting f_ops this way prevents VOP_CLOSE from being
3203                  * called or fdrop() releasing the vp from v_data.   Since
3204                  * the VOP_OPEN failed we don't want to VOP_CLOSE.
3205                  */
3206                 fp->f_ops = &badfileops;
3207                 fp->f_data = NULL;
3208                 fdrop(fp, td);
3209                 goto bad;
3210         }
             /* Account for this open as a writer on the vnode. */
3211         if (fmode & FWRITE)
3212                 vp->v_writecount++;
3213
3214         /*
3215          * The fp now owns a reference on the vnode.  We still have our own
3216          * ref+lock.
3217          */
3218         vref(vp);
3219
3220         /*
3221          * Make sure that a VM object is created for VMIO support.  If this
3222          * fails just fdrop() normally to clean up.
3223          */
3224         if (vn_canvmio(vp) == TRUE) {
3225                 if ((error = vfs_object_create(vp, td)) != 0) {
3226                         fdrop(fp, td);
3227                         goto bad;
3228                 }
3229         }
3230
3231         /*
3232          * The open was successful, associate it with a file descriptor.
3233          */
3234         if ((error = fsetfd(p, fp, &indx)) != 0) {
                     /* Undo the v_writecount increment made after VOP_OPEN. */
3235                 if (fmode & FWRITE)
3236                         vp->v_writecount--;
3237                 fdrop(fp, td);
3238                 goto bad;
3239         }
3240
             /*
              * O_EXLOCK / O_SHLOCK: request an F_FLOCK-style advisory lock
              * (F_WRLCK or F_RDLCK) described by l_whence = SEEK_SET,
              * l_start = 0, l_len = 0.  F_WAIT is added unless FNONBLOCK
              * was requested.
              */
3241         if (fmode & (O_EXLOCK | O_SHLOCK)) {
3242                 lf.l_whence = SEEK_SET;
3243                 lf.l_start = 0;
3244                 lf.l_len = 0;
3245                 if (fmode & O_EXLOCK)
3246                         lf.l_type = F_WRLCK;
3247                 else
3248                         lf.l_type = F_RDLCK;
3249                 type = F_FLOCK;
3250                 if ((fmode & FNONBLOCK) == 0)
3251                         type |= F_WAIT;
                     /*
                      * The vnode lock is not held across VOP_ADVLOCK(); it is
                      * re-taken below once the advisory lock was granted.
                      */
3252                 VOP_UNLOCK(vp, 0, td);
3253                 if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
3254                         /*
3255                          * lock request failed.  Normally close the descriptor
3256                          * but handle the case where someone might have dup()d
3257                          * or close()d it when we weren't looking.
3258                          */
3259                         if (fdp->fd_files[indx].fp == fp) {
3260                                 funsetfd(fdp, indx);
3261                                 fdrop(fp, td);
3262                         }
3263
3264                         /*
3265                          * release our private reference.
3266                          */
3267                         fdrop(fp, td);
                             /*
                              * vp was unlocked just above, so our reference is
                              * dropped with vrele() here rather than through
                              * the vput() at 'bad'.
                              */
3268                         vrele(vp);
3269                         return (error);
3270                 }
3271                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
3272                 fp->f_flag |= FHASLOCK;
3273         }
3274         if ((vp->v_type == VREG) && (VOP_GETVOBJECT(vp, NULL) != 0))
3275                 vfs_object_create(vp, td);
3276
             /*
              * Success: give up our own vnode ref+lock and our private hold
              * on fp, then report the descriptor index to the caller.
              */
3277         vput(vp);
3278         fdrop(fp, td);
3279         uap->sysmsg_result = indx;
3280         return (0);
3281
             /*
              * Error exit for every path on which we still hold the vnode
              * lock and our reference.
              */
3282 bad:
3283         vput(vp);
3284         return (error);
3285 }
3286
3287 /*
3288  * fhstat_args(struct fhandle *u_fhp, struct stat *sb)
3289  */
3290 int
3291 fhstat(struct fhstat_args *uap)
3292 {
3293         struct thread *td = curthread;
3294         struct stat sb;
3295         fhandle_t fh;
3296         struct mount *mp;
3297         struct vnode *vp;
3298         int error;
3299
3300         /*
3301          * Must be super user
3302          */
3303         error = suser(td);
3304         if (error)
3305                 return (error);
3306
3307         error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
3308         if (error)
3309                 return (error);
3310
3311         if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
3312                 return (ESTALE);
3313         if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp)))
3314                 return (error);
3315         error = vn_stat(vp, &sb, td);
3316         vput(vp);
3317         if (error)
3318                 return (error);
3319         error = copyout(&sb, uap->sb, sizeof(sb));
3320         return (error);
3321 }
3322
3323 /*
3324  * fhstatfs_args(struct fhandle *u_fhp, struct statfs *buf)
3325  */
3326 int
3327 fhstatfs(struct fhstatfs_args *uap)
3328 {
3329         struct thread *td = curthread;
3330         struct proc *p = td->td_proc;
3331         struct statfs *sp;
3332         struct mount *mp;
3333         struct vnode *vp;
3334         struct statfs sb;
3335         char *fullpath, *freepath;
3336         fhandle_t fh;
3337         int error;
3338
3339         /*
3340          * Must be super user
3341          */
3342         if ((error = suser(td)))
3343                 return (error);
3344
3345         if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
3346                 return (error);
3347
3348         if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
3349                 return (ESTALE);
3350
3351         if (p != NULL && (p->p_fd->fd_nrdir->nc_flag & NCF_ROOT) == 0 &&
3352             !chroot_visible_mnt(mp, p))
3353                 return (ESTALE);
3354
3355         if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp)))
3356                 return (error);
3357         mp = vp->v_mount;
3358         sp = &mp->mnt_stat;
3359         vput(vp);
3360         if ((error = VFS_STATFS(mp, sp, td)) != 0)
3361                 return (error);
3362
3363         error = cache_fullpath(p, mp->mnt_ncp, &fullpath, &freepath);
3364         if (error)
3365                 return(error);
3366         bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3367         strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
3368         free(freepath, M_TEMP);
3369
3370         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3371         if (suser(td)) {
3372                 bcopy(sp, &sb, sizeof(sb));
3373                 sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
3374                 sp = &sb;
3375         }
3376         return (copyout(sp, uap->buf, sizeof(*sp)));
3377 }
3378
3379 /*
3380  * Syscall to push extended attribute configuration information into the
3381  * VFS.  Accepts a path, which it converts to a mountpoint, as well as
3382  * a command (int cmd), and attribute name and misc data.  For now, the
3383  * attribute name is left in userspace for consumption by the VFS_op.
3384  * It will probably be changed to be copied into sysspace by the
3385  * syscall in the future, once issues with various consumers of the
3386  * attribute code have raised their hands.
3387  *
3388  * Currently this is used only by UFS Extended Attributes.
3389  */
3390 int
3391 extattrctl(struct extattrctl_args *uap)
3392 {
3393         struct nlookupdata nd;
3394         struct mount *mp;
3395         struct vnode *vp;
3396         int error;
3397
3398         vp = NULL;
3399         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3400         if (error == 0)
3401                 error = nlookup(&nd);
3402         if (error == 0) {
3403                 mp = nd.nl_ncp->nc_mount;
3404                 error = VFS_EXTATTRCTL(mp, uap->cmd,
3405                                 uap->attrname, uap->arg,
3406                                 nd.nl_td);
3407         }
3408         nlookup_done(&nd);
3409         return (error);
3410 }
3411
3412 /*
3413  * Syscall to set a named extended attribute on a file or directory.
3414  * Accepts attribute name, and a uio structure pointing to the data to set.
3415  * The uio is consumed in the style of writev().  The real work happens
3416  * in VOP_SETEXTATTR().
3417  */
3418 int
3419 extattr_set_file(struct extattr_set_file_args *uap)
3420 {
3421         char attrname[EXTATTR_MAXNAMELEN];
3422         struct iovec aiov[UIO_SMALLIOV];
3423         struct iovec *needfree;
3424         struct nlookupdata nd;
3425         struct iovec *iov;
3426         struct vnode *vp;
3427         struct uio auio;
3428         u_int iovlen;
3429         u_int cnt;
3430         int error;
3431         int i;
3432
3433         error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
3434         if (error)
3435                 return (error);
3436
3437         vp = NULL;
3438         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3439         if (error == 0)
3440                 error = nlookup(&nd);
3441         if (error == 0)
3442                 error = cache_vget(nd.nl_ncp, nd.nl_cred, LK_EXCLUSIVE, &vp);
3443         if (error) {
3444                 nlookup_done(&nd);
3445                 return (error);
3446         }
3447
3448         needfree = NULL;
3449         iovlen = uap->iovcnt * sizeof(struct iovec);
3450         if (uap->iovcnt > UIO_SMALLIOV) {
3451                 if (uap->iovcnt > UIO_MAXIOV) {
3452                         error = EINVAL;
3453                         goto done;
3454                 }
3455                 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
3456                 needfree = iov;
3457         } else {
3458                 iov = aiov;
3459         }
3460         auio.uio_iov = iov;
3461         auio.uio_iovcnt = uap->iovcnt;
3462         auio.uio_rw = UIO_WRITE;
3463         auio.uio_segflg = UIO_USERSPACE;
3464         auio.uio_td = nd.nl_td;
3465         auio.uio_offset = 0;
3466         if ((error = copyin(uap->iovp, iov, iovlen)))
3467                 goto done;
3468         auio.uio_resid = 0;
3469         for (i = 0; i < uap->iovcnt; i++) {
3470                 if (iov->iov_len > INT_MAX - auio.uio_resid) {
3471                         error = EINVAL;
3472                         goto done;
3473                 }
3474                 auio.uio_resid += iov->iov_len;
3475                 iov++;
3476         }
3477         cnt = auio.uio_resid;
3478         error = VOP_SETEXTATTR(vp, attrname, &auio, nd.nl_cred, nd.nl_td);
3479         cnt -= auio.uio_resid;
3480         uap->sysmsg_result = cnt;
3481 done:
3482         vput(vp);
3483         nlookup_done(&nd);
3484         if (needfree)
3485                 FREE(needfree, M_IOV);
3486         return (error);
3487 }
3488
3489 /*
3490  * Syscall to get a named extended attribute on a file or directory.
3491  * Accepts attribute name, and a uio structure pointing to a buffer for the
3492  * data.  The uio is consumed in the style of readv().  The real work
3493  * happens in VOP_GETEXTATTR();
3494  */
3495 int
3496 extattr_get_file(struct extattr_get_file_args *uap)
3497 {
3498         char attrname[EXTATTR_MAXNAMELEN];
3499         struct iovec aiov[UIO_SMALLIOV];
3500         struct iovec *needfree;
3501         struct nlookupdata nd;
3502         struct iovec *iov;
3503         struct vnode *vp;
3504         struct uio auio;
3505         u_int iovlen;
3506         u_int cnt;
3507         int error;
3508         int i;
3509
3510         error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
3511         if (error)
3512                 return (error);
3513
3514         vp = NULL;
3515         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3516         if (error == 0)
3517                 error = nlookup(&nd);
3518         if (error == 0)
3519                 error = cache_vget(nd.nl_ncp, nd.nl_cred, LK_EXCLUSIVE, &vp);
3520         if (error) {
3521                 nlookup_done(&nd);
3522                 return (error);
3523         }
3524
3525         iovlen = uap->iovcnt * sizeof (struct iovec);
3526         needfree = NULL;
3527         if (uap->iovcnt > UIO_SMALLIOV) {
3528                 if (uap->iovcnt > UIO_MAXIOV) {
3529                         error = EINVAL;
3530                         goto done;
3531                 }
3532                 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
3533                 needfree = iov;
3534         } else {
3535                 iov = aiov;
3536         }
3537         auio.uio_iov = iov;
3538         auio.uio_iovcnt = uap->iovcnt;
3539         auio.uio_rw = UIO_READ;
3540         auio.uio_segflg = UIO_USERSPACE;
3541         auio.uio_td = nd.nl_td;
3542         auio.uio_offset = 0;
3543         if ((error = copyin(uap->iovp, iov, iovlen)))
3544                 goto done;
3545         auio.uio_resid = 0;
3546         for (i = 0; i < uap->iovcnt; i++) {
3547                 if (iov->iov_len > INT_MAX - auio.uio_resid) {
3548                         error = EINVAL;
3549                         goto done;
3550                 }
3551                 auio.uio_resid += iov->iov_len;
3552                 iov++;
3553         }
3554         cnt = auio.uio_resid;
3555         error = VOP_GETEXTATTR(vp, attrname, &auio, nd.nl_cred, nd.nl_td);
3556         cnt -= auio.uio_resid;
3557         uap->sysmsg_result = cnt;
3558 done:
3559         vput(vp);
3560         nlookup_done(&nd);
3561         if (needfree)
3562                 FREE(needfree, M_IOV);
3563         return(error);
3564 }
3565
3566 /*
3567  * Syscall to delete a named extended attribute from a file or directory.
3568  * Accepts attribute name.  The real work happens in VOP_SETEXTATTR().
3569  */
3570 int
3571 extattr_delete_file(struct extattr_delete_file_args *uap)
3572 {
3573         char attrname[EXTATTR_MAXNAMELEN];
3574         struct nlookupdata nd;
3575         struct vnode *vp;
3576         int error;
3577
3578         error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
3579         if (error)
3580                 return(error);
3581
3582         vp = NULL;
3583         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3584         if (error == 0)
3585                 error = nlookup(&nd);
3586         if (error == 0)
3587                 error = cache_vget(nd.nl_ncp, nd.nl_cred, LK_EXCLUSIVE, &vp);
3588         if (error) {
3589                 nlookup_done(&nd);
3590                 return (error);
3591         }
3592
3593         error = VOP_SETEXTATTR(vp, attrname, NULL, nd.nl_cred, nd.nl_td);
3594         vput(vp);
3595         nlookup_done(&nd);
3596         return(error);
3597 }
3598
3599 static int
3600 chroot_visible_mnt(struct mount *mp, struct proc *p)
3601 {
3602         struct namecache *ncp;
3603         /*
3604          * First check if this file system is below
3605          * the chroot path.
3606          */
3607         ncp = mp->mnt_ncp;
3608         while (ncp != NULL && ncp != p->p_fd->fd_nrdir)
3609                 ncp = ncp->nc_parent;
3610         if (ncp == NULL) {
3611                 /*
3612                  * This is not below the chroot path.
3613                  *
3614                  * Check if the chroot path is on the same filesystem,
3615                  * by determing if we have to cross a mount point
3616                  * before reaching mp->mnt_ncp.
3617                  */
3618                 ncp = p->p_fd->fd_nrdir;
3619                 while (ncp != NULL && ncp != mp->mnt_ncp) {
3620                         if (ncp->nc_flag & NCF_MOUNTPT) {
3621                                 ncp = NULL;
3622                                 break;
3623                         }
3624                         ncp = ncp->nc_parent;
3625                 }
3626         }
3627         return(ncp != NULL);
3628 }