sys/kern/vfs_syscalls.c

   1 /*
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      @(#)vfs_syscalls.c      8.13 (Berkeley) 4/15/94
  39  * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $
  40  * $DragonFly: src/sys/kern/vfs_syscalls.c,v 1.135 2008/11/11 00:55:49 pavalos Exp $
  41  */
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/buf.h>
  46 #include <sys/conf.h>
  47 #include <sys/sysent.h>
  48 #include <sys/malloc.h>
  49 #include <sys/mount.h>
  50 #include <sys/mountctl.h>
  51 #include <sys/sysproto.h>
  52 #include <sys/filedesc.h>
  53 #include <sys/kernel.h>
  54 #include <sys/fcntl.h>
  55 #include <sys/file.h>
  56 #include <sys/linker.h>
  57 #include <sys/stat.h>
  58 #include <sys/unistd.h>
  59 #include <sys/vnode.h>
  60 #include <sys/proc.h>
  61 #include <sys/priv.h>
  62 #include <sys/jail.h>
  63 #include <sys/namei.h>
  64 #include <sys/nlookup.h>
  65 #include <sys/dirent.h>
  66 #include <sys/extattr.h>
  67 #include <sys/spinlock.h>
  68 #include <sys/kern_syscall.h>
  69 #include <sys/objcache.h>
  70 #include <sys/sysctl.h>
  71
  72 #include <sys/buf2.h>
  73 #include <sys/file2.h>
  74 #include <sys/spinlock2.h>
  75
  76 #include <vm/vm.h>
  77 #include <vm/vm_object.h>
  78 #include <vm/vm_page.h>
  79
  80 #include <machine/limits.h>
  81 #include <machine/stdarg.h>
  82
  83 #include <vfs/union/union.h>
  84
  85 static void mount_warning(struct mount *mp, const char *ctl, ...);
  86 static int mount_path(struct proc *p, struct mount *mp, char **rb, char **fb);
  87 static int checkvp_chdir (struct vnode *vn, struct thread *td);
  88 static void checkdirs (struct nchandle *old_nch, struct nchandle *new_nch);
  89 static int chroot_refuse_vdir_fds (struct filedesc *fdp);
  90 static int chroot_visible_mnt(struct mount *mp, struct proc *p);
  91 static int getutimes (const struct timeval *, struct timespec *);
  92 static int setfown (struct vnode *, uid_t, gid_t);
  93 static int setfmode (struct vnode *, int);
  94 static int setfflags (struct vnode *, int);
  95 static int setutimes (struct vnode *, struct vattr *,
  96                         const struct timespec *, int);
     /*
      * The lines above are forward declarations of file-local (static)
      * helpers.
      *
      * usermount backs the sysctl declared below.  While it is 0, both
      * sys_mount() and sys_unmount() insist on priv_check(td, PRIV_ROOT).
      */
  97 static int      usermount = 0;  /* if 1, non-root can mount fs. */
  98
     /*
      * Global function-pointer hook.  It is only declared here; nothing in
      * the visible part of this file assigns or calls it.
      * NOTE(review): presumably installed by the union filesystem - confirm.
      */
  99 int (*union_dircheckp) (struct thread *, struct vnode **, struct file *);
 100
     /* Export usermount read-write (CTLFLAG_RW) as "usermount" under _vfs. */
 101 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "");
 102
 103 /*
 104  * Virtual File System System Calls
 105  */
 106
 107 /*
 108  * Mount a file system.
 109  *
 110  * mount_args(char *type, char *path, int flags, caddr_t data)
 111  *
      * Two paths share this function:
      *
      *  - MNT_UPDATE set: the vnode found at uap->path must carry VROOT or
      *    VPFSROOT (else EINVAL).  The existing struct mount is busied, its
      *    flags are saved, VFS_MOUNT() is called, and the saved flags are
      *    restored if VFS_MOUNT() fails.
      *
      *  - otherwise: a new struct mount is allocated for the filesystem
      *    type named by uap->type, VFS_MOUNT() is called and, on success,
      *    the mount is appended to the mount list and started.
      *
      * Permission rules visible in the code:
      *  - callers for which jailed(cred) is true always get EPERM;
      *  - with usermount == 0 the caller must pass priv_check(PRIV_ROOT);
      *  - MNT_EXPORTED always requires PRIV_ROOT;
      *  - callers failing priv_check(PRIV_ROOT) get MNT_NOSUID | MNT_NODEV
      *    or'ed into uap->flags;
      *  - an update requires mnt_stat.f_owner == cr_uid or PRIV_ROOT;
      *  - a new mount requires the directory's va_uid == cr_uid or PRIV_ROOT.
      *
      * Returns 0 on success or an errno value.  The MP lock is taken on
      * entry and released at the done: label on every exit path.
      *
 112  * MPALMOSTSAFE
 113  */
 114 int
 115 sys_mount(struct mount_args *uap)
 116 {
 117         struct thread *td = curthread;
 118         struct vnode *vp;
 119         struct nchandle nch;
 120         struct mount *mp, *nullmp;
 121         struct vfsconf *vfsp;
 122         int error, flag = 0, flag2 = 0;
 123         int hasmount;
 124         struct vattr va;
 125         struct nlookupdata nd;
 126         char fstypename[MFSNAMELEN];
 127         struct ucred *cred;
 128
 129         get_mplock();
 130         cred = td->td_ucred;
 131         if (jailed(cred)) {
 132                 error = EPERM;
 133                 goto done;
 134         }
 135         if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
 136                 goto done;
 137
 138         /*
 139          * Do not allow NFS export by non-root users.
 140          */
 141         if (uap->flags & MNT_EXPORTED) {
 142                 error = priv_check(td, PRIV_ROOT);
 143                 if (error)
 144                         goto done;
 145         }
 146         /*
 147          * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
 148          */
 149         if (priv_check(td, PRIV_ROOT))
 150                 uap->flags |= MNT_NOSUID | MNT_NODEV;
 151
 152         /*
 153          * Lookup the requested path and extract the nch and vnode.
 154          */
 155         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
 156         if (error == 0) {
 157                 if ((error = nlookup(&nd)) == 0) {
 158                         if (nd.nl_nch.ncp->nc_vp == NULL)
 159                                 error = ENOENT;
 160                 }
 161         }
 162         if (error) {
 163                 nlookup_done(&nd);
 164                 goto done;
 165         }
 166
 167         /*
 168          * If the target filesystem is resolved via a nullfs mount, then
 169          * nd.nl_nch.mount will be pointing to the nullfs mount structure
 170          * instead of the target file system. We need it in case we are
 171          * doing an update.
 172          */
 173         nullmp = nd.nl_nch.mount;
 174
 175         /*
 176          * Extract the locked+refd ncp and cleanup the nd structure
 177          */
 178         nch = nd.nl_nch;
 179         cache_zero(&nd.nl_nch);
 180         nlookup_done(&nd);
 181
             /*
              * hasmount remembers whether something is already mounted on
              * this namecache entry; both paths below turn it into EBUSY.
              */
 182         if ((nch.ncp->nc_flag & NCF_ISMOUNTPT) && cache_findmount(&nch))
 183                 hasmount = 1;
 184         else
 185                 hasmount = 0;
 186
 187
 188         /*
 189          * now we have the locked ref'd nch and unreferenced vnode.
 190          */
 191         vp = nch.ncp->nc_vp;
 192         if ((error = vget(vp, LK_EXCLUSIVE)) != 0) {
 193                 cache_put(&nch);
 194                 goto done;
 195         }
 196         cache_unlock(&nch);
 197
 198         /*
 199          * Extract the file system type. We need to know this early, to take
 200          * appropriate actions if we are dealing with a nullfs.
 201          */
 202         if ((error = copyinstr(uap->type, fstypename, MFSNAMELEN, NULL)) != 0) {
 203                 cache_drop(&nch);
 204                 vput(vp);
 205                 goto done;
 206         }
 207
 208         /*
 209          * Now we have an unlocked ref'd nch and a locked ref'd vp
 210          */
 211         if (uap->flags & MNT_UPDATE) {
 212                 if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
 213                         cache_drop(&nch);
 214                         vput(vp);
 215                         error = EINVAL;
 216                         goto done;
 217                 }
 218
                     /*
                      * The length of 5 includes the terminating NUL, so only
                      * the exact type name "null" selects the mount recorded
                      * from the lookup; every other type uses vp->v_mount.
                      */
 219                 if (strncmp(fstypename, "null", 5) == 0) {
 220                         KKASSERT(nullmp);
 221                         mp = nullmp;
 222                 } else {
 223                         mp = vp->v_mount;
 224                 }
 225
                     /* Saved so a failed VFS_MOUNT() can be rolled back. */
 226                 flag = mp->mnt_flag;
 227                 flag2 = mp->mnt_kern_flag;
 228                 /*
 229                  * We only allow the filesystem to be reloaded if it
 230                  * is currently mounted read-only.
 231                  */
 232                 if ((uap->flags & MNT_RELOAD) &&
 233                     ((mp->mnt_flag & MNT_RDONLY) == 0)) {
 234                         cache_drop(&nch);
 235                         vput(vp);
 236                         error = EOPNOTSUPP;     /* Needs translation */
 237                         goto done;
 238                 }
 239                 /*
 240                  * Only root, or the user that did the original mount is
 241                  * permitted to update it.
 242                  */
 243                 if (mp->mnt_stat.f_owner != cred->cr_uid &&
 244                     (error = priv_check(td, PRIV_ROOT))) {
 245                         cache_drop(&nch);
 246                         vput(vp);
 247                         goto done;
 248                 }
 249                 if (vfs_busy(mp, LK_NOWAIT)) {
 250                         cache_drop(&nch);
 251                         vput(vp);
 252                         error = EBUSY;
 253                         goto done;
 254                 }
 255                 if ((vp->v_flag & VMOUNT) != 0 || hasmount) {
 256                         cache_drop(&nch);
 257                         vfs_unbusy(mp);
 258                         vput(vp);
 259                         error = EBUSY;
 260                         goto done;
 261                 }
                     /*
                      * VMOUNT stays set on the vnode until the MNT_UPDATE
                      * branch after VFS_MOUNT() clears it again.
                      */
 262                 vp->v_flag |= VMOUNT;
 263                 mp->mnt_flag |=
 264                     uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
 265                 vn_unlock(vp);
 266                 goto update;
 267         }
 268         /*
 269          * If the user is not root, ensure that they own the directory
 270          * onto which we are attempting to mount.
 271          */
 272         if ((error = VOP_GETATTR(vp, &va)) ||
 273             (va.va_uid != cred->cr_uid && (error = priv_check(td, PRIV_ROOT)))) {
 274                 cache_drop(&nch);
 275                 vput(vp);
 276                 goto done;
 277         }
 278         if ((error = vinvalbuf(vp, V_SAVE, 0, 0)) != 0) {
 279                 cache_drop(&nch);
 280                 vput(vp);
 281                 goto done;
 282         }
 283         if (vp->v_type != VDIR) {
 284                 cache_drop(&nch);
 285                 vput(vp);
 286                 error = ENOTDIR;
 287                 goto done;
 288         }
 289         if (vp->v_mount->mnt_kern_flag & MNTK_NOSTKMNT) {
 290                 cache_drop(&nch);
 291                 vput(vp);
 292                 error = EPERM;
 293                 goto done;
 294         }
             /*
              * Find the filesystem type by name.  If it is not registered,
              * try to load a linker file of the same name and look again.
              */
 295         vfsp = vfsconf_find_by_name(fstypename);
 296         if (vfsp == NULL) {
 297                 linker_file_t lf;
 298
 299                 /* Only load modules for root (very important!) */
 300                 if ((error = priv_check(td, PRIV_ROOT)) != 0) {
 301                         cache_drop(&nch);
 302                         vput(vp);
 303                         goto done;
 304                 }
 305                 error = linker_load_file(fstypename, &lf);
 306                 if (error || lf == NULL) {
 307                         cache_drop(&nch);
 308                         vput(vp);
 309                         if (lf == NULL)
 310                                 error = ENODEV;
 311                         goto done;
 312                 }
 313                 lf->userrefs++;
 314                 /* lookup again, see if the VFS was loaded */
 315                 vfsp = vfsconf_find_by_name(fstypename);
 316                 if (vfsp == NULL) {
 317                         lf->userrefs--;
 318                         linker_file_unload(lf);
 319                         cache_drop(&nch);
 320                         vput(vp);
 321                         error = ENODEV;
 322                         goto done;
 323                 }
 324         }
 325         if ((vp->v_flag & VMOUNT) != 0 || hasmount) {
 326                 cache_drop(&nch);
 327                 vput(vp);
 328                 error = EBUSY;
 329                 goto done;
 330         }
 331         vp->v_flag |= VMOUNT;
 332
 333         /*
 334          * Allocate and initialize the filesystem.
 335          */
 336         mp = kmalloc(sizeof(struct mount), M_MOUNT, M_ZERO|M_WAITOK);
 337         TAILQ_INIT(&mp->mnt_nvnodelist);
 338         TAILQ_INIT(&mp->mnt_reservedvnlist);
 339         TAILQ_INIT(&mp->mnt_jlist);
 340         mp->mnt_nvnodelistsize = 0;
 341         lockinit(&mp->mnt_lock, "vfslock", 0, 0);
             /*
              * NOTE(review): the return value of vfs_busy() is ignored here;
              * presumably it cannot fail on a mount nobody else can see yet -
              * confirm.
              */
 342         vfs_busy(mp, LK_NOWAIT);
 343         mp->mnt_op = vfsp->vfc_vfsops;
 344         mp->mnt_vfc = vfsp;
 345         vfsp->vfc_refcount++;
 346         mp->mnt_stat.f_type = vfsp->vfc_typenum;
 347         mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
 348         strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
 349         mp->mnt_stat.f_owner = cred->cr_uid;
 350         mp->mnt_iosize_max = DFLTPHYS;
 351         vn_unlock(vp);
 352 update:
 353         /*
 354          * Set the mount level flags.
 355          */
 356         if (uap->flags & MNT_RDONLY)
 357                 mp->mnt_flag |= MNT_RDONLY;
 358         else if (mp->mnt_flag & MNT_RDONLY)
 359                 mp->mnt_kern_flag |= MNTK_WANTRDWR;
             /*
              * Clear the user-settable option bits, then copy in the ones
              * requested by the caller.
              */
 360         mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 361             MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME |
 362             MNT_NOSYMFOLLOW | MNT_IGNORE |
 363             MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
 364         mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC |
 365             MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE |
 366             MNT_NOSYMFOLLOW | MNT_IGNORE |
 367             MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
 368         /*
 369          * Mount the filesystem.
 370          * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
 371          * get.
 372          */
 373         error = VFS_MOUNT(mp, uap->path, uap->data, cred);
             /*
              * Update path: drop the transient flags, put back the saved
              * flag words if VFS_MOUNT() failed, then release the busy
              * state, the VMOUNT mark, the vnode ref and the nch ref.
              */
 374         if (mp->mnt_flag & MNT_UPDATE) {
 375                 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
 376                         mp->mnt_flag &= ~MNT_RDONLY;
 377                 mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
 378                 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
 379                 if (error) {
 380                         mp->mnt_flag = flag;
 381                         mp->mnt_kern_flag = flag2;
 382                 }
 383                 vfs_unbusy(mp);
 384                 vp->v_flag &= ~VMOUNT;
 385                 vrele(vp);
 386                 cache_drop(&nch);
 387                 goto done;
 388         }
 389         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 390         /*
 391          * Put the new filesystem on the mount list after root.  The mount
 392          * point gets its own mnt_ncmountpt (unless the VFS already set one
 393          * up) which represents the root of the mount.  The lookup code
 394          * detects the mount point going forward and checks the root of
 395          * the mount going backwards.
 396          *
 397          * It is not necessary to invalidate or purge the vnode underneath
 398          * because elements under the mount will be given their own glue
 399          * namecache record.
 400          */
 401         if (!error) {
 402                 if (mp->mnt_ncmountpt.ncp == NULL) {
 403                         /*
 404                          * allocate, then unlock, but leave the ref intact
 405                          */
 406                         cache_allocroot(&mp->mnt_ncmountpt, mp, NULL);
 407                         cache_unlock(&mp->mnt_ncmountpt);
 408                 }
 409                 mp->mnt_ncmounton = nch;                /* inherits ref */
 410                 nch.ncp->nc_flag |= NCF_ISMOUNTPT;
 411
 412                 /* XXX get the root of the fs and cache_setvp(mnt_ncmountpt...) */
 413                 vp->v_flag &= ~VMOUNT;
 414                 mountlist_insert(mp, MNTINS_LAST);
 415                 vn_unlock(vp);
 416                 checkdirs(&mp->mnt_ncmounton, &mp->mnt_ncmountpt);
                     /*
                      * NOTE(review): the result of vfs_allocate_syncvnode()
                      * is overwritten by the VFS_START() assignment two
                      * lines down, so a failure here is never returned to
                      * the caller.  The mount also stays on the mount list
                      * when VFS_START() itself fails.
                      */
 417                 error = vfs_allocate_syncvnode(mp);
 418                 vfs_unbusy(mp);
 419                 error = VFS_START(mp, 0);
 420                 vrele(vp);
 421         } else {
                     /*
                      * VFS_MOUNT() failed: strip any vnode ops installed on
                      * the mount, undo the vfsconf reference and free the
                      * never-published mount structure.
                      */
 422                 vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
 423                 vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
 424                 vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
 425                 vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
 426                 vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
 427                 vp->v_flag &= ~VMOUNT;
 428                 mp->mnt_vfc->vfc_refcount--;
 429                 vfs_unbusy(mp);
 430                 kfree(mp, M_MOUNT);
 431                 cache_drop(&nch);
 432                 vput(vp);
 433         }
 434 done:
 435         rel_mplock();
 436         return (error);
 437 }
 438
 439 /*
 440  * Scan all active processes to see if any of them have a current
 441  * or root directory onto which the new filesystem has just been
 442  * mounted. If so, replace them with the new mount point.
 443  *
 444  * The passed ncp is ref'd and locked (from the mount code) and
 445  * must be associated with the vnode representing the root of the
 446  * mount point.
      *
      * NOTE(review): sys_mount() calls cache_unlock() on both handles it
      * passes to checkdirs(), so the "locked" wording above looks stale -
      * confirm.
      *
      * struct checkdirs_info is the argument bundle checkdirs() hands to
      * checkdirs_callback() through allproc_scan().
 447  */
 448 struct checkdirs_info {
 449         struct nchandle old_nch;        /* handle being mounted on; matched against fd_ncdir/fd_nrdir */
 450         struct nchandle new_nch;        /* handle installed in place of a match */
 451         struct vnode *old_vp;           /* not set by checkdirs() nor read by the callback */
 452         struct vnode *new_vp;           /* vnode installed as fd_cdir/fd_rdir on a match */
 453 };
 454
 455 static int checkdirs_callback(struct proc *p, void *data);
 456
 457 static void
 458 checkdirs(struct nchandle *old_nch, struct nchandle *new_nch)
 459 {
             /*
              * Repoint process current/root directories from old_nch (the
              * entry just mounted on) to new_nch (the root of the new
              * mount).  Called from sys_mount() after mountlist_insert().
              * Panics if VFS_ROOT() fails on the new mount.
              */
 460         struct checkdirs_info info;
 461         struct vnode *olddp;
 462         struct vnode *newdp;
 463         struct mount *mp;
 464
 465         /*
 466          * If the old mount point's vnode has a usecount of 1, it is not
 467          * being held as a descriptor anywhere.
 468          */
 469         olddp = old_nch->ncp->nc_vp;
 470         if (olddp == NULL || olddp->v_sysref.refcnt == 1)
 471                 return;
 472
 473         /*
 474          * Force the root vnode of the new mount point to be resolved
 475          * so we can update any matching processes.
 476          */
 477         mp = new_nch->mount;
 478         if (VFS_ROOT(mp, &newdp))
 479                 panic("mount: lost mount");
 480         cache_setunresolved(new_nch);
 481         cache_setvp(new_nch, newdp);
 482
 483         /*
 484          * Special handling of the root node
 485          */
 486         if (rootvnode == olddp) {
 487                 vref(newdp);
 488                 vfs_cache_setroot(newdp, cache_hold(new_nch));
 489         }
 490
 491         /*
 492          * Pass newdp separately so the callback does not have to access
 493          * it via new_nch->ncp->nc_vp.
 494          */
             /* info.old_vp is left unset; the callback never reads it. */
 495         info.old_nch = *old_nch;
 496         info.new_nch = *new_nch;
 497         info.new_vp = newdp;
 498         allproc_scan(checkdirs_callback, &info);
             /*
              * NOTE(review): presumably VFS_ROOT() returned newdp locked and
              * referenced and this vput() releases both - confirm.
              */
 499         vput(newdp);
 500 }
 501
 502 /*
 503  * NOTE: callback is not MP safe because the scanned process's filedesc
 504  * structure can be ripped out from under us, among other things.
      *
      * allproc_scan() callback used by checkdirs().  data points at a
      * struct checkdirs_info.  For a process with a filedesc, the current
      * directory and/or root directory are switched to info->new_nch and
      * info->new_vp when their namecache handle equals info->old_nch (same
      * mount and same ncp).  Always returns 0.
 505  */
 506 static int
 507 checkdirs_callback(struct proc *p, void *data)
 508 {
 509         struct checkdirs_info *info = data;
 510         struct filedesc *fdp;
 511         struct nchandle ncdrop1;
 512         struct nchandle ncdrop2;
 513         struct vnode *vprele1;
 514         struct vnode *vprele2;
 515
 516         if ((fdp = p->p_fd) != NULL) {
                     /*
                      * ncdrop1/2 and vprele1/2 collect the replaced handles
                      * and vnodes so they can be released only after the
                      * spinlock has been dropped.
                      */
 517                 cache_zero(&ncdrop1);
 518                 cache_zero(&ncdrop2);
 519                 vprele1 = NULL;
 520                 vprele2 = NULL;
 521
 522                 /*
 523                  * MPUNSAFE - XXX fdp can be pulled out from under a
 524                  * foreign process.
 525                  *
 526                  * A shared filedesc is ok, we don't have to copy it
 527                  * because we are making this change globally.
 528                  */
 529                 spin_lock_wr(&fdp->fd_spin);
                     /* current directory */
 530                 if (fdp->fd_ncdir.mount == info->old_nch.mount &&
 531                     fdp->fd_ncdir.ncp == info->old_nch.ncp) {
 532                         vprele1 = fdp->fd_cdir;
 533                         vref(info->new_vp);
 534                         fdp->fd_cdir = info->new_vp;
 535                         ncdrop1 = fdp->fd_ncdir;
 536                         cache_copy(&info->new_nch, &fdp->fd_ncdir);
 537                 }
                     /* root directory */
 538                 if (fdp->fd_nrdir.mount == info->old_nch.mount &&
 539                     fdp->fd_nrdir.ncp == info->old_nch.ncp) {
 540                         vprele2 = fdp->fd_rdir;
 541                         vref(info->new_vp);
 542                         fdp->fd_rdir = info->new_vp;
 543                         ncdrop2 = fdp->fd_nrdir;
 544                         cache_copy(&info->new_nch, &fdp->fd_nrdir);
 545                 }
 546                 spin_unlock_wr(&fdp->fd_spin);
                     /* Release whatever was replaced, outside the spinlock. */
 547                 if (ncdrop1.ncp)
 548                         cache_drop(&ncdrop1);
 549                 if (ncdrop2.ncp)
 550                         cache_drop(&ncdrop2);
 551                 if (vprele1)
 552                         vrele(vprele1);
 553                 if (vprele2)
 554                         vrele(vprele2);
 555         }
 556         return(0);
 557 }
 558
 559 /*
 560  * Unmount a file system.
 561  *
 562  * Note: unmount takes a path to the vnode mounted on as argument,
 563  * not special file (as before).
 564  *
 565  * unmount_args(char *path, int flags)
 566  *
      * Validates the request and then hands the mount to dounmount().
      * Errors produced here:
      *  - EPERM when the caller's credential has a non-NULL cr_prison;
      *  - the priv_check(PRIV_ROOT) error when usermount == 0, or when the
      *    caller's uid differs from the mount's f_owner;
      *  - any nlookup_init()/nlookup() error for uap->path;
      *  - EINVAL for a mount flagged MNT_ROOTFS, or when the looked-up
      *    entry is not the mount's mnt_ncmountpt.
      * Otherwise the result of dounmount(mp, uap->flags) is returned.
      *
 567  * MPALMOSTSAFE
 568  */
 569 int
 570 sys_unmount(struct unmount_args *uap)
 571 {
 572         struct thread *td = curthread;
 573         struct proc *p = td->td_proc;
 574         struct mount *mp = NULL;
 575         struct nlookupdata nd;
 576         int error;
 577
 578         KKASSERT(p);
 579         get_mplock();
             /* sys_mount() expresses the same jail test as jailed(cred). */
 580         if (td->td_ucred->cr_prison != NULL) {
 581                 error = EPERM;
 582                 goto done;
 583         }
 584         if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
 585                 goto done;
 586
 587         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
 588         if (error == 0)
 589                 error = nlookup(&nd);
 590         if (error)
 591                 goto out;
 592
 593         mp = nd.nl_nch.mount;
 594
 595         /*
 596          * Only root, or the user that did the original mount is
 597          * permitted to unmount this filesystem.
 598          */
 599         if ((mp->mnt_stat.f_owner != td->td_ucred->cr_uid) &&
 600             (error = priv_check(td, PRIV_ROOT)))
 601                 goto out;
 602
 603         /*
 604          * Don't allow unmounting the root file system.
 605          */
 606         if (mp->mnt_flag & MNT_ROOTFS) {
 607                 error = EINVAL;
 608                 goto out;
 609         }
 610
 611         /*
 612          * Must be the root of the filesystem
 613          */
 614         if (nd.nl_nch.ncp != mp->mnt_ncmountpt.ncp) {
 615                 error = EINVAL;
 616                 goto out;
 617         }
 618
 619 out:
             /*
              * The lookup state is released before unmounting.
              * NOTE(review): no reference on mp is taken in this function,
              * so mp is used by dounmount() after the nch that produced it
              * has been released; presumably the MP lock keeps it valid -
              * confirm.
              */
 620         nlookup_done(&nd);
 621         if (error == 0)
 622                 error = dounmount(mp, uap->flags);
 623 done:
 624         rel_mplock();
 625         return (error);
 626 }
 627
 628 /*
 629  * Do the actual file system unmount.
      *
      * dounmount_interlock() is the callback dounmount() passes to
      * mountlist_interlock().  It reports EBUSY when MNTK_UNMOUNT is already
      * set on the mount; otherwise it sets the flag and returns 0.
 630  */
 631 static int
 632 dounmount_interlock(struct mount *mp)
 633 {
 634         const int already = (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0;
 635
 636         mp->mnt_kern_flag |= MNTK_UNMOUNT;      /* no change if already set */
 637         return (already ? EBUSY : 0);
 638 }
 639
 640 int
 641 dounmount(struct mount *mp, int flags)
 642 {
             /*
              * Unmount mp.  flags are MNT_* bits; MNT_FORCE turns the
              * "still referenced" conditions below from EBUSY failures into
              * warnings.
              *
              * Returns 0 on success.  On failure an errno value is returned
              * and MNTK_UNMOUNT/MNTK_UNMOUNTF are cleared again, so the
              * mount stays usable.
              *
              * On success the mount structure is kfree()'d unless a forced
              * unmount found outstanding namecache or mount references
              * (freeok == 0), in which case it is left allocated.
              */
 643         struct namecache *ncp;
 644         struct nchandle nch;
 645         struct vnode *vp;
 646         int error;
 647         int async_flag;
 648         int lflags;
 649         int freeok = 1;
 650
 651         /*
 652          * Exclusive access for unmounting purposes
 653          */
 654         if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
 655                 return (error);
 656
 657         /*
 658          * Allow filesystems to detect that a forced unmount is in progress.
 659          */
 660         if (flags & MNT_FORCE)
 661                 mp->mnt_kern_flag |= MNTK_UNMOUNTF;
             /* Only a forced unmount omits LK_NOWAIT on the mount lock. */
 662         lflags = LK_EXCLUSIVE | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT);
 663         error = lockmgr(&mp->mnt_lock, lflags);
 664         if (error) {
 665                 mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
 666                 if (mp->mnt_kern_flag & MNTK_MWAIT)
 667                         wakeup(mp);
 668                 return (error);
 669         }
 670
 671         if (mp->mnt_flag & MNT_EXPUBLIC)
 672                 vfs_setpublicfs(NULL, NULL, NULL);
 673
 674         vfs_msync(mp, MNT_WAIT);
             /* Remember MNT_ASYNC so the failure path can put it back. */
 675         async_flag = mp->mnt_flag & MNT_ASYNC;
 676         mp->mnt_flag &=~ MNT_ASYNC;
 677
 678         /*
 679          * If this filesystem isn't aliasing other filesystems,
 680          * try to invalidate any remaining namecache entries and
 681          * check the count afterwards.
 682          */
 683         if ((mp->mnt_kern_flag & MNTK_NCALIASED) == 0) {
 684                 cache_lock(&mp->mnt_ncmountpt);
 685                 cache_inval(&mp->mnt_ncmountpt, CINV_DESTROY|CINV_CHILDREN);
 686                 cache_unlock(&mp->mnt_ncmountpt);
 687
 688                 if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
 689                     (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {
 690
 691                         if ((flags & MNT_FORCE) == 0) {
 692                                 error = EBUSY;
 693                                 mount_warning(mp, "Cannot unmount: "
 694                                                   "%d namecache "
 695                                                   "references still "
 696                                                   "present",
 697                                                   ncp->nc_refs - 1);
 698                         } else {
 699                                 mount_warning(mp, "Forced unmount: "
 700                                                   "%d namecache "
 701                                                   "references still "
 702                                                   "present",
 703                                                   ncp->nc_refs - 1);
 704                                 freeok = 0;
 705                         }
 706                 }
 707         }
 708
 709         /*
 710          * nchandle records ref the mount structure.  Expect a count of 1
 711          * (our mount->mnt_ncmountpt).
 712          */
 713         if (mp->mnt_refs != 1) {
 714                 if ((flags & MNT_FORCE) == 0) {
 715                         mount_warning(mp, "Cannot unmount: "
 716                                           "%d process references still "
 717                                           "present", mp->mnt_refs);
 718                         error = EBUSY;
 719                 } else {
 720                         mount_warning(mp, "Forced unmount: "
 721                                           "%d process references still "
 722                                           "present", mp->mnt_refs);
 723                         freeok = 0;
 724                 }
 725         }
 726
 727         /*
 728          * Decommission our special mnt_syncer vnode.  This also stops
 729          * the vnlru code.  If we are unable to unmount we recommission
 730          * the vnode.
 731          */
 732         if (error == 0) {
 733                 if ((vp = mp->mnt_syncer) != NULL) {
 734                         mp->mnt_syncer = NULL;
 735                         vrele(vp);
 736                 }
                     /*
                      * VFS_UNMOUNT() runs when the mount is read-only (no
                      * sync attempted), when VFS_SYNC() succeeded, or when
                      * the unmount is forced.  If none of these hold, the
                      * VFS_SYNC() error is what gets returned below.
                      */
 737                 if (((mp->mnt_flag & MNT_RDONLY) ||
 738                      (error = VFS_SYNC(mp, MNT_WAIT)) == 0) ||
 739                     (flags & MNT_FORCE)) {
 740                         error = VFS_UNMOUNT(mp, flags);
 741                 }
 742         }
             /*
              * Failure: recreate the syncer vnode if it was released above,
              * restore the flags and the lock state, wake any MNTK_MWAIT
              * sleepers and hand the error back.
              */
 743         if (error) {
 744                 if (mp->mnt_syncer == NULL)
 745                         vfs_allocate_syncvnode(mp);
 746                 mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
 747                 mp->mnt_flag |= async_flag;
 748                 lockmgr(&mp->mnt_lock, LK_RELEASE);
 749                 if (mp->mnt_kern_flag & MNTK_MWAIT)
 750                         wakeup(mp);
 751                 return (error);
 752         }
 753         /*
 754          * Clean up any journals still associated with the mount after
 755          * filesystem activity has ceased.
 756          */
 757         journal_remove_all_journals(mp,
 758             ((flags & MNT_FORCE) ? MC_JOURNAL_STOP_IMM : 0));
 759
 760         mountlist_remove(mp);
 761
 762         /*
 763          * Remove any installed vnode ops here so the individual VFSs don't
 764          * have to.
 765          */
 766         vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
 767         vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
 768         vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
 769         vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
 770         vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
 771
             /*
              * Detach both namecache handles from the mount structure
              * first, then clear the mount-point state and drop the ref
              * through a local copy.
              */
 772         if (mp->mnt_ncmountpt.ncp != NULL) {
 773                 nch = mp->mnt_ncmountpt;
 774                 cache_zero(&mp->mnt_ncmountpt);
 775                 cache_clrmountpt(&nch);
 776                 cache_drop(&nch);
 777         }
 778         if (mp->mnt_ncmounton.ncp != NULL) {
 779                 nch = mp->mnt_ncmounton;
 780                 cache_zero(&mp->mnt_ncmounton);
 781                 cache_clrmountpt(&nch);
 782                 cache_drop(&nch);
 783         }
 784
 785         mp->mnt_vfc->vfc_refcount--;
 786         if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
 787                 panic("unmount: dangling vnode");
 788         lockmgr(&mp->mnt_lock, LK_RELEASE);
 789         if (mp->mnt_kern_flag & MNTK_MWAIT)
 790                 wakeup(mp);
             /* freeok == 0: a forced unmount left references outstanding. */
 791         if (freeok)
 792                 kfree(mp, M_MOUNT);
 793         return (0);
 794 }
 795
 796 static
 797 void
 798 mount_warning(struct mount *mp, const char *ctl, ...)
 799 {
             /*
              * Print one console line of the form "unmount(<where>): <msg>".
              * <msg> is ctl formatted with the variadic arguments.  <where>
              * is the full path of mp->mnt_ncmounton when cache_fullpath()
              * can produce it; otherwise it is the mount pointer, followed
              * by the mounted-on entry's nc_name when one is available.
              */
 800         char *path, *freebuf;
 801         __va_list ap;
 802         int resolved;
 803
 804         __va_start(ap, ctl);
 805         resolved = !cache_fullpath(NULL, &mp->mnt_ncmounton, &path, &freebuf);
 806         if (resolved) {
 807                 kprintf("unmount(%s): ", path);
 808         } else {
 809                 kprintf("unmount(%p", mp);
 810                 if (mp->mnt_ncmounton.ncp && mp->mnt_ncmounton.ncp->nc_name)
 811                         kprintf(",%s", mp->mnt_ncmounton.ncp->nc_name);
 812                 kprintf("): ");
 813         }
 814         kvprintf(ctl, ap);
 815         kprintf("\n");
 816         if (resolved)
 817                 kfree(freebuf, M_TEMP);         /* buffer from cache_fullpath() */
 818         __va_end(ap);
 819 }
 820
 821 /*
 822  * Shim cache_fullpath() to handle the case where a process is chrooted into
 823  * a subdirectory of a mount.  In this case if the root mount matches the
 824  * process root directory's mount we have to specify the process's root
 825  * directory instead of the mount point, because the mount point might
 826  * be above the root directory.
 827  */
 828 static
 829 int
 830 mount_path(struct proc *p, struct mount *mp, char **rb, char **fb)
 831 {
 832         struct nchandle *nch;
 833
 834         if (p && p->p_fd->fd_nrdir.mount == mp)
 835                 nch = &p->p_fd->fd_nrdir;
 836         else
 837                 nch = &mp->mnt_ncmountpt;
 838         return(cache_fullpath(p, nch, rb, fb));
 839 }
 840
 841 /*
 842  * Sync each mounted filesystem.
 843  */
 844
 845 #ifdef DEBUG
 846 static int syncprt = 0;
 847 SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
 848 #endif /* DEBUG */
 849
 850 static int sync_callback(struct mount *mp, void *data);
 851
 852 /*
 853  * MPALMOSTSAFE
 854  */
 855 int
 856 sys_sync(struct sync_args *uap)
 857 {
 858         get_mplock();
 859         mountlist_scan(sync_callback, NULL, MNTSCAN_FORWARD);
 860 #ifdef DEBUG
 861         /*
 862          * print out buffer pool stat information on each sync() call.
 863          */
 864         if (syncprt)
 865                 vfs_bufstats();
 866 #endif /* DEBUG */
 867         rel_mplock();
 868         return (0);
 869 }
 870
 871 static
 872 int
 873 sync_callback(struct mount *mp, void *data __unused)
 874 {
 875         int asyncflag;
 876
 877         if ((mp->mnt_flag & MNT_RDONLY) == 0) {
 878                 asyncflag = mp->mnt_flag & MNT_ASYNC;
 879                 mp->mnt_flag &= ~MNT_ASYNC;
 880                 vfs_msync(mp, MNT_NOWAIT);
 881                 VFS_SYNC(mp, MNT_NOWAIT);
 882                 mp->mnt_flag |= asyncflag;
 883         }
 884         return(0);
 885 }
 886
 887 /* XXX PRISON: could be per prison flag */
 888 static int prison_quotas;
 889 #if 0
 890 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
 891 #endif
 892
 893 /*
 894  *  quotactl_args(char *path, int fcmd, int uid, caddr_t arg)
 895  *
 896  * Change filesystem quotas.
 897  *
 898  * MPALMOSTSAFE
 899  */
 900 int
 901 sys_quotactl(struct quotactl_args *uap)
 902 {
 903         struct nlookupdata nd;
 904         struct thread *td;
 905         struct proc *p;
 906         struct mount *mp;
 907         int error;
 908
 909         get_mplock();
 910         td = curthread;
 911         p = td->td_proc;
 912         if (td->td_ucred->cr_prison && !prison_quotas) {
 913                 error = EPERM;
 914                 goto done;
 915         }
 916
 917         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
 918         if (error == 0)
 919                 error = nlookup(&nd);
 920         if (error == 0) {
 921                 mp = nd.nl_nch.mount;
 922                 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid,
 923                                     uap->arg, nd.nl_cred);
 924         }
 925         nlookup_done(&nd);
 926 done:
 927         rel_mplock();
 928         return (error);
 929 }
 930
 931 /*
 932  * mountctl(char *path, int op, int fd, const void *ctl, int ctllen,
 933  *              void *buf, int buflen)
 934  *
 935  * This function operates on a mount point and executes the specified
 936  * operation using the specified control data, and possibly returns data.
 937  *
 938  * The actual number of bytes stored in the result buffer is returned, 0
 939  * if none, otherwise an error is returned.
 940  *
 941  * MPALMOSTSAFE
 942  */
 943 int
 944 sys_mountctl(struct mountctl_args *uap)
 945 {
 946         struct thread *td = curthread;
 947         struct proc *p = td->td_proc;
 948         struct file *fp;
 949         void *ctl = NULL;
 950         void *buf = NULL;
 951         char *path = NULL;
 952         int error;
 953
 954         /*
 955          * Sanity and permissions checks.  We must be root.
 956          */
 957         KKASSERT(p);
 958         if (td->td_ucred->cr_prison != NULL)
 959                 return (EPERM);
 960         if ((uap->op != MOUNTCTL_MOUNTFLAGS) &&
 961             (error = priv_check(td, PRIV_ROOT)) != 0)
 962                 return (error);
 963
 964         /*
 965          * Argument length checks
 966          */
 967         if (uap->ctllen < 0 || uap->ctllen > 1024)
 968                 return (EINVAL);
 969         if (uap->buflen < 0 || uap->buflen > 16 * 1024)
 970                 return (EINVAL);
 971         if (uap->path == NULL)
 972                 return (EINVAL);
 973
 974         /*
 975          * Allocate the necessary buffers and copyin data
 976          */
 977         path = objcache_get(namei_oc, M_WAITOK);
 978         error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
 979         if (error)
 980                 goto done;
 981
 982         if (uap->ctllen) {
 983                 ctl = kmalloc(uap->ctllen + 1, M_TEMP, M_WAITOK|M_ZERO);
 984                 error = copyin(uap->ctl, ctl, uap->ctllen);
 985                 if (error)
 986                         goto done;
 987         }
 988         if (uap->buflen)
 989                 buf = kmalloc(uap->buflen + 1, M_TEMP, M_WAITOK|M_ZERO);
 990
 991         /*
 992          * Validate the descriptor
 993          */
 994         if (uap->fd >= 0) {
 995                 fp = holdfp(p->p_fd, uap->fd, -1);
 996                 if (fp == NULL) {
 997                         error = EBADF;
 998                         goto done;
 999                 }
1000         } else {
1001                 fp = NULL;
1002         }
1003
1004         /*
1005          * Execute the internal kernel function and clean up.
1006          */
1007         get_mplock();
1008         error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen, buf, uap->buflen, &uap->sysmsg_result);
1009         rel_mplock();
1010         if (fp)
1011                 fdrop(fp);
1012         if (error == 0 && uap->sysmsg_result > 0)
1013                 error = copyout(buf, uap->buf, uap->sysmsg_result);
1014 done:
1015         if (path)
1016                 objcache_put(namei_oc, path);
1017         if (ctl)
1018                 kfree(ctl, M_TEMP);
1019         if (buf)
1020                 kfree(buf, M_TEMP);
1021         return (error);
1022 }
1023
1024 /*
1025  * Execute a mount control operation by resolving the path to a mount point
1026  * and calling vop_mountctl().
1027  *
1028  * Use the mount point from the nch instead of the vnode so nullfs mounts
1029  * can properly spike the VOP.
1030  */
1031 int
1032 kern_mountctl(const char *path, int op, struct file *fp,
1033                 const void *ctl, int ctllen,
1034                 void *buf, int buflen, int *res)
1035 {
1036         struct vnode *vp;
1037         struct mount *mp;
1038         struct nlookupdata nd;
1039         int error;
1040
1041         *res = 0;
1042         vp = NULL;
1043         error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
1044         if (error == 0)
1045                 error = nlookup(&nd);
1046         if (error == 0)
1047                 error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
1048         mp = nd.nl_nch.mount;
1049         nlookup_done(&nd);
1050         if (error)
1051                 return (error);
1052         vn_unlock(vp);
1053
1054         /*
1055          * Must be the root of the filesystem
1056          */
1057         if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
1058                 vrele(vp);
1059                 return (EINVAL);
1060         }
1061         error = vop_mountctl(mp->mnt_vn_use_ops, op, fp, ctl, ctllen,
1062                                 buf, buflen, res);
1063         vrele(vp);
1064         return (error);
1065 }
1066
1067 int
1068 kern_statfs(struct nlookupdata *nd, struct statfs *buf)
1069 {
1070         struct thread *td = curthread;
1071         struct proc *p = td->td_proc;
1072         struct mount *mp;
1073         struct statfs *sp;
1074         char *fullpath, *freepath;
1075         int error;
1076
1077         if ((error = nlookup(nd)) != 0)
1078                 return (error);
1079         mp = nd->nl_nch.mount;
1080         sp = &mp->mnt_stat;
1081         if ((error = VFS_STATFS(mp, sp, nd->nl_cred)) != 0)
1082                 return (error);
1083
1084         error = mount_path(p, mp, &fullpath, &freepath);
1085         if (error)
1086                 return(error);
1087         bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1088         strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1089         kfree(freepath, M_TEMP);
1090
1091         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1092         bcopy(sp, buf, sizeof(*buf));
1093         /* Only root should have access to the fsid's. */
1094         if (priv_check(td, PRIV_ROOT))
1095                 buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1096         return (0);
1097 }
1098
1099 /*
1100  * statfs_args(char *path, struct statfs *buf)
1101  *
1102  * Get filesystem statistics.
1103  *
1104  * MPALMOSTSAFE
1105  */
1106 int
1107 sys_statfs(struct statfs_args *uap)
1108 {
1109         struct nlookupdata nd;
1110         struct statfs buf;
1111         int error;
1112
1113         get_mplock();
1114         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1115         if (error == 0)
1116                 error = kern_statfs(&nd, &buf);
1117         nlookup_done(&nd);
1118         if (error == 0)
1119                 error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1120         rel_mplock();
1121         return (error);
1122 }
1123
1124 /*
1125  * MPALMOSTSAFE
1126  */
1127 int
1128 kern_fstatfs(int fd, struct statfs *buf)
1129 {
1130         struct thread *td = curthread;
1131         struct proc *p = td->td_proc;
1132         struct file *fp;
1133         struct mount *mp;
1134         struct statfs *sp;
1135         char *fullpath, *freepath;
1136         int error;
1137
1138         KKASSERT(p);
1139         if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
1140                 return (error);
1141         get_mplock();
1142         mp = ((struct vnode *)fp->f_data)->v_mount;
1143         if (mp == NULL) {
1144                 error = EBADF;
1145                 goto done;
1146         }
1147         if (fp->f_cred == NULL) {
1148                 error = EINVAL;
1149                 goto done;
1150         }
1151         sp = &mp->mnt_stat;
1152         if ((error = VFS_STATFS(mp, sp, fp->f_cred)) != 0)
1153                 goto done;
1154
1155         if ((error = mount_path(p, mp, &fullpath, &freepath)) != 0)
1156                 goto done;
1157         bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1158         strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1159         kfree(freepath, M_TEMP);
1160
1161         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1162         bcopy(sp, buf, sizeof(*buf));
1163
1164         /* Only root should have access to the fsid's. */
1165         if (priv_check(td, PRIV_ROOT))
1166                 buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1167         error = 0;
1168 done:
1169         rel_mplock();
1170         fdrop(fp);
1171         return (error);
1172 }
1173
1174 /*
1175  * fstatfs_args(int fd, struct statfs *buf)
1176  *
1177  * Get filesystem statistics.
1178  *
1179  * MPSAFE
1180  */
1181 int
1182 sys_fstatfs(struct fstatfs_args *uap)
1183 {
1184         struct statfs buf;
1185         int error;
1186
1187         error = kern_fstatfs(uap->fd, &buf);
1188
1189         if (error == 0)
1190                 error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1191         return (error);
1192 }
1193
1194 int
1195 kern_statvfs(struct nlookupdata *nd, struct statvfs *buf)
1196 {
1197         struct mount *mp;
1198         struct statvfs *sp;
1199         int error;
1200
1201         if ((error = nlookup(nd)) != 0)
1202                 return (error);
1203         mp = nd->nl_nch.mount;
1204         sp = &mp->mnt_vstat;
1205         if ((error = VFS_STATVFS(mp, sp, nd->nl_cred)) != 0)
1206                 return (error);
1207
1208         sp->f_flag = 0;
1209         if (mp->mnt_flag & MNT_RDONLY)
1210                 sp->f_flag |= ST_RDONLY;
1211         if (mp->mnt_flag & MNT_NOSUID)
1212                 sp->f_flag |= ST_NOSUID;
1213         bcopy(sp, buf, sizeof(*buf));
1214         return (0);
1215 }
1216
1217 /*
1218  * statfs_args(char *path, struct statfs *buf)
1219  *
1220  * Get filesystem statistics.
1221  *
1222  * MPALMOSTSAFE
1223  */
1224 int
1225 sys_statvfs(struct statvfs_args *uap)
1226 {
1227         struct nlookupdata nd;
1228         struct statvfs buf;
1229         int error;
1230
1231         get_mplock();
1232         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1233         if (error == 0)
1234                 error = kern_statvfs(&nd, &buf);
1235         nlookup_done(&nd);
1236         if (error == 0)
1237                 error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1238         rel_mplock();
1239         return (error);
1240 }
1241
1242 int
1243 kern_fstatvfs(int fd, struct statvfs *buf)
1244 {
1245         struct thread *td = curthread;
1246         struct proc *p = td->td_proc;
1247         struct file *fp;
1248         struct mount *mp;
1249         struct statvfs *sp;
1250         int error;
1251
1252         KKASSERT(p);
1253         if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
1254                 return (error);
1255         mp = ((struct vnode *)fp->f_data)->v_mount;
1256         if (mp == NULL) {
1257                 error = EBADF;
1258                 goto done;
1259         }
1260         if (fp->f_cred == NULL) {
1261                 error = EINVAL;
1262                 goto done;
1263         }
1264         sp = &mp->mnt_vstat;
1265         if ((error = VFS_STATVFS(mp, sp, fp->f_cred)) != 0)
1266                 goto done;
1267
1268         sp->f_flag = 0;
1269         if (mp->mnt_flag & MNT_RDONLY)
1270                 sp->f_flag |= ST_RDONLY;
1271         if (mp->mnt_flag & MNT_NOSUID)
1272                 sp->f_flag |= ST_NOSUID;
1273
1274         bcopy(sp, buf, sizeof(*buf));
1275         error = 0;
1276 done:
1277         fdrop(fp);
1278         return (error);
1279 }
1280
1281 /*
1282  * fstatfs_args(int fd, struct statfs *buf)
1283  *
1284  * Get filesystem statistics.
1285  *
1286  * MPALMOSTSAFE
1287  */
1288 int
1289 sys_fstatvfs(struct fstatvfs_args *uap)
1290 {
1291         struct statvfs buf;
1292         int error;
1293
1294         get_mplock();
1295         error = kern_fstatvfs(uap->fd, &buf);
1296         rel_mplock();
1297
1298         if (error == 0)
1299                 error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1300         return (error);
1301 }
1302
1303 /*
1304  * getfsstat_args(struct statfs *buf, long bufsize, int flags)
1305  *
1306  * Get statistics on all filesystems.
1307  */
1308
1309 struct getfsstat_info {
1310         struct statfs *sfsp;
1311         long count;
1312         long maxcount;
1313         int error;
1314         int flags;
1315         struct thread *td;
1316 };
1317
1318 static int getfsstat_callback(struct mount *, void *);
1319
1320 /*
1321  * MPALMOSTSAFE
1322  */
1323 int
1324 sys_getfsstat(struct getfsstat_args *uap)
1325 {
1326         struct thread *td = curthread;
1327         struct getfsstat_info info;
1328
1329         bzero(&info, sizeof(info));
1330
1331         info.maxcount = uap->bufsize / sizeof(struct statfs);
1332         info.sfsp = uap->buf;
1333         info.count = 0;
1334         info.flags = uap->flags;
1335         info.td = td;
1336
1337         get_mplock();
1338         mountlist_scan(getfsstat_callback, &info, MNTSCAN_FORWARD);
1339         rel_mplock();
1340         if (info.sfsp && info.count > info.maxcount)
1341                 uap->sysmsg_result = info.maxcount;
1342         else
1343                 uap->sysmsg_result = info.count;
1344         return (info.error);
1345 }
1346
1347 static int
1348 getfsstat_callback(struct mount *mp, void *data)
1349 {
1350         struct getfsstat_info *info = data;
1351         struct statfs *sp;
1352         char *freepath;
1353         char *fullpath;
1354         int error;
1355
1356         if (info->sfsp && info->count < info->maxcount) {
1357                 if (info->td->td_proc &&
1358                     !chroot_visible_mnt(mp, info->td->td_proc)) {
1359                         return(0);
1360                 }
1361                 sp = &mp->mnt_stat;
1362
1363                 /*
1364                  * If MNT_NOWAIT or MNT_LAZY is specified, do not
1365                  * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
1366                  * overrides MNT_WAIT.
1367                  */
1368                 if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1369                     (info->flags & MNT_WAIT)) &&
1370                     (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
1371                         return(0);
1372                 }
1373                 sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1374
1375                 error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
1376                 if (error) {
1377                         info->error = error;
1378                         return(-1);
1379                 }
1380                 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1381                 strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1382                 kfree(freepath, M_TEMP);
1383
1384                 error = copyout(sp, info->sfsp, sizeof(*sp));
1385                 if (error) {
1386                         info->error = error;
1387                         return (-1);
1388                 }
1389                 ++info->sfsp;
1390         }
1391         info->count++;
1392         return(0);
1393 }
1394
1395 /*
1396  * getvfsstat_args(struct statfs *buf, struct statvfs *vbuf,
1397                    long bufsize, int flags)
1398  *
1399  * Get statistics on all filesystems.
1400  */
1401
1402 struct getvfsstat_info {
1403         struct statfs *sfsp;
1404         struct statvfs *vsfsp;
1405         long count;
1406         long maxcount;
1407         int error;
1408         int flags;
1409         struct thread *td;
1410 };
1411
1412 static int getvfsstat_callback(struct mount *, void *);
1413
1414 /*
1415  * MPALMOSTSAFE
1416  */
1417 int
1418 sys_getvfsstat(struct getvfsstat_args *uap)
1419 {
1420         struct thread *td = curthread;
1421         struct getvfsstat_info info;
1422
1423         bzero(&info, sizeof(info));
1424
1425         info.maxcount = uap->vbufsize / sizeof(struct statvfs);
1426         info.sfsp = uap->buf;
1427         info.vsfsp = uap->vbuf;
1428         info.count = 0;
1429         info.flags = uap->flags;
1430         info.td = td;
1431
1432         get_mplock();
1433         mountlist_scan(getvfsstat_callback, &info, MNTSCAN_FORWARD);
1434         if (info.vsfsp && info.count > info.maxcount)
1435                 uap->sysmsg_result = info.maxcount;
1436         else
1437                 uap->sysmsg_result = info.count;
1438         rel_mplock();
1439         return (info.error);
1440 }
1441
1442 static int
1443 getvfsstat_callback(struct mount *mp, void *data)
1444 {
1445         struct getvfsstat_info *info = data;
1446         struct statfs *sp;
1447         struct statvfs *vsp;
1448         char *freepath;
1449         char *fullpath;
1450         int error;
1451
1452         if (info->vsfsp && info->count < info->maxcount) {
1453                 if (info->td->td_proc &&
1454                     !chroot_visible_mnt(mp, info->td->td_proc)) {
1455                         return(0);
1456                 }
1457                 sp = &mp->mnt_stat;
1458                 vsp = &mp->mnt_vstat;
1459
1460                 /*
1461                  * If MNT_NOWAIT or MNT_LAZY is specified, do not
1462                  * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
1463                  * overrides MNT_WAIT.
1464                  */
1465                 if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1466                     (info->flags & MNT_WAIT)) &&
1467                     (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
1468                         return(0);
1469                 }
1470                 sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1471
1472                 if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1473                     (info->flags & MNT_WAIT)) &&
1474                     (error = VFS_STATVFS(mp, vsp, info->td->td_ucred))) {
1475                         return(0);
1476                 }
1477                 vsp->f_flag = 0;
1478                 if (mp->mnt_flag & MNT_RDONLY)
1479                         vsp->f_flag |= ST_RDONLY;
1480                 if (mp->mnt_flag & MNT_NOSUID)
1481                         vsp->f_flag |= ST_NOSUID;
1482
1483                 error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
1484                 if (error) {
1485                         info->error = error;
1486                         return(-1);
1487                 }
1488                 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1489                 strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1490                 kfree(freepath, M_TEMP);
1491
1492                 error = copyout(sp, info->sfsp, sizeof(*sp));
1493                 if (error == 0)
1494                         error = copyout(vsp, info->vsfsp, sizeof(*vsp));
1495                 if (error) {
1496                         info->error = error;
1497                         return (-1);
1498                 }
1499                 ++info->sfsp;
1500                 ++info->vsfsp;
1501         }
1502         info->count++;
1503         return(0);
1504 }
1505
1506
1507 /*
1508  * fchdir_args(int fd)
1509  *
1510  * Change current working directory to a given file descriptor.
1511  *
1512  * MPALMOSTSAFE
1513  */
1514 int
1515 sys_fchdir(struct fchdir_args *uap)
1516 {
1517         struct thread *td = curthread;
1518         struct proc *p = td->td_proc;
1519         struct filedesc *fdp = p->p_fd;
1520         struct vnode *vp, *ovp;
1521         struct mount *mp;
1522         struct file *fp;
1523         struct nchandle nch, onch, tnch;
1524         int error;
1525
1526         if ((error = holdvnode(fdp, uap->fd, &fp)) != 0)
1527                 return (error);
1528         get_mplock();
1529         vp = (struct vnode *)fp->f_data;
1530         vref(vp);
1531         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1532         if (fp->f_nchandle.ncp == NULL)
1533                 error = ENOTDIR;
1534         else
1535                 error = checkvp_chdir(vp, td);
1536         if (error) {
1537                 vput(vp);
1538                 goto done;
1539         }
1540         cache_copy(&fp->f_nchandle, &nch);
1541
1542         /*
1543          * If the ncp has become a mount point, traverse through
1544          * the mount point.
1545          */
1546
1547         while (!error && (nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
1548                (mp = cache_findmount(&nch)) != NULL
1549         ) {
1550                 error = nlookup_mp(mp, &tnch);
1551                 if (error == 0) {
1552                         cache_unlock(&tnch);    /* leave ref intact */
1553                         vput(vp);
1554                         vp = tnch.ncp->nc_vp;
1555                         error = vget(vp, LK_SHARED);
1556                         KKASSERT(error == 0);
1557                         cache_drop(&nch);
1558                         nch = tnch;
1559                 }
1560         }
1561         if (error == 0) {
1562                 ovp = fdp->fd_cdir;
1563                 onch = fdp->fd_ncdir;
1564                 vn_unlock(vp);          /* leave ref intact */
1565                 fdp->fd_cdir = vp;
1566                 fdp->fd_ncdir = nch;
1567                 cache_drop(&onch);
1568                 vrele(ovp);
1569         } else {
1570                 cache_drop(&nch);
1571                 vput(vp);
1572         }
1573         fdrop(fp);
1574 done:
1575         rel_mplock();
1576         return (error);
1577 }
1578
1579 int
1580 kern_chdir(struct nlookupdata *nd)
1581 {
1582         struct thread *td = curthread;
1583         struct proc *p = td->td_proc;
1584         struct filedesc *fdp = p->p_fd;
1585         struct vnode *vp, *ovp;
1586         struct nchandle onch;
1587         int error;
1588
1589         if ((error = nlookup(nd)) != 0)
1590                 return (error);
1591         if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
1592                 return (ENOENT);
1593         if ((error = vget(vp, LK_SHARED)) != 0)
1594                 return (error);
1595
1596         error = checkvp_chdir(vp, td);
1597         vn_unlock(vp);
1598         if (error == 0) {
1599                 ovp = fdp->fd_cdir;
1600                 onch = fdp->fd_ncdir;
1601                 cache_unlock(&nd->nl_nch);      /* leave reference intact */
1602                 fdp->fd_ncdir = nd->nl_nch;
1603                 fdp->fd_cdir = vp;
1604                 cache_drop(&onch);
1605                 vrele(ovp);
1606                 cache_zero(&nd->nl_nch);
1607         } else {
1608                 vrele(vp);
1609         }
1610         return (error);
1611 }
1612
1613 /*
1614  * chdir_args(char *path)
1615  *
1616  * Change current working directory (``.'').
1617  *
1618  * MPALMOSTSAFE
1619  */
1620 int
1621 sys_chdir(struct chdir_args *uap)
1622 {
1623         struct nlookupdata nd;
1624         int error;
1625
1626         get_mplock();
1627         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1628         if (error == 0)
1629                 error = kern_chdir(&nd);
1630         nlookup_done(&nd);
1631         rel_mplock();
1632         return (error);
1633 }
1634
1635 /*
1636  * Helper function for raised chroot(2) security function:  Refuse if
1637  * any filedescriptors are open directories.
1638  */
1639 static int
1640 chroot_refuse_vdir_fds(struct filedesc *fdp)
1641 {
1642         struct vnode *vp;
1643         struct file *fp;
1644         int error;
1645         int fd;
1646
1647         for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
1648                 if ((error = holdvnode(fdp, fd, &fp)) != 0)
1649                         continue;
1650                 vp = (struct vnode *)fp->f_data;
1651                 if (vp->v_type != VDIR) {
1652                         fdrop(fp);
1653                         continue;
1654                 }
1655                 fdrop(fp);
1656                 return(EPERM);
1657         }
1658         return (0);
1659 }
1660
1661 /*
1662  * This sysctl determines if we will allow a process to chroot(2) if it
1663  * has a directory open:
1664  *      0: disallowed for all processes.
1665  *      1: allowed for processes that were not already chroot(2)'ed.
1666  *      2: allowed for all processes.
1667  */
1668
1669 static int chroot_allow_open_directories = 1;
1670
1671 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
1672      &chroot_allow_open_directories, 0, "");
1673
1674 /*
1675  * chroot to the specified namecache entry.  We obtain the vp from the
1676  * namecache data.  The passed ncp must be locked and referenced and will
1677  * remain locked and referenced on return.
1678  */
1679 int
1680 kern_chroot(struct nchandle *nch)
1681 {
1682         struct thread *td = curthread;
1683         struct proc *p = td->td_proc;
1684         struct filedesc *fdp = p->p_fd;
1685         struct vnode *vp;
1686         int error;
1687
1688         /*
1689          * Only privileged user can chroot
1690          */
1691         error = priv_check_cred(td->td_ucred, PRIV_VFS_CHROOT, 0);
1692         if (error)
1693                 return (error);
1694
1695         /*
1696          * Disallow open directory descriptors (fchdir() breakouts).
1697          */
1698         if (chroot_allow_open_directories == 0 ||
1699            (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
1700                 if ((error = chroot_refuse_vdir_fds(fdp)) != 0)
1701                         return (error);
1702         }
1703         if ((vp = nch->ncp->nc_vp) == NULL)
1704                 return (ENOENT);
1705
1706         if ((error = vget(vp, LK_SHARED)) != 0)
1707                 return (error);
1708
1709         /*
1710          * Check the validity of vp as a directory to change to and
1711          * associate it with rdir/jdir.
1712          */
1713         error = checkvp_chdir(vp, td);
1714         vn_unlock(vp);                  /* leave reference intact */
1715         if (error == 0) {
1716                 vrele(fdp->fd_rdir);
1717                 fdp->fd_rdir = vp;      /* reference inherited by fd_rdir */
1718                 cache_drop(&fdp->fd_nrdir);
1719                 cache_copy(nch, &fdp->fd_nrdir);
1720                 if (fdp->fd_jdir == NULL) {
1721                         fdp->fd_jdir = vp;
1722                         vref(fdp->fd_jdir);
1723                         cache_copy(nch, &fdp->fd_njdir);
1724                 }
1725         } else {
1726                 vrele(vp);
1727         }
1728         return (error);
1729 }
1730
1731 /*
1732  * chroot_args(char *path)
1733  *
1734  * Change notion of root (``/'') directory.
1735  *
1736  * MPALMOSTSAFE
1737  */
1738 int
1739 sys_chroot(struct chroot_args *uap)
1740 {
1741         struct thread *td = curthread;
1742         struct nlookupdata nd;
1743         int error;
1744
1745         KKASSERT(td->td_proc);
1746         get_mplock();
1747         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1748         if (error == 0) {
1749                 nd.nl_flags |= NLC_EXEC;
1750                 error = nlookup(&nd);
1751                 if (error == 0)
1752                         error = kern_chroot(&nd.nl_nch);
1753         }
1754         nlookup_done(&nd);
1755         rel_mplock();
1756         return(error);
1757 }
1758
1759 /*
1760  * Common routine for chroot and chdir.  Given a locked, referenced vnode,
1761  * determine whether it is legal to chdir to the vnode.  The vnode's state
1762  * is not changed by this call.
1763  */
1764 int
1765 checkvp_chdir(struct vnode *vp, struct thread *td)
1766 {
1767         int error;
1768
1769         if (vp->v_type != VDIR)
1770                 error = ENOTDIR;
1771         else
1772                 error = VOP_EACCESS(vp, VEXEC, td->td_ucred);
1773         return (error);
1774 }
1775
1776 int
1777 kern_open(struct nlookupdata *nd, int oflags, int mode, int *res)
1778 {
1779         struct thread *td = curthread;
1780         struct proc *p = td->td_proc;
1781         struct lwp *lp = td->td_lwp;
1782         struct filedesc *fdp = p->p_fd;
1783         int cmode, flags;
1784         struct file *nfp;
1785         struct file *fp;
1786         struct vnode *vp;
1787         int type, indx, error;
1788         struct flock lf;
1789
1790         if ((oflags & O_ACCMODE) == O_ACCMODE)
1791                 return (EINVAL);
1792         flags = FFLAGS(oflags);
1793         error = falloc(lp, &nfp, NULL);
1794         if (error)
1795                 return (error);
1796         fp = nfp;
1797         cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
1798
1799         /*
1800          * XXX p_dupfd is a real mess.  It allows a device to return a
1801          * file descriptor to be duplicated rather then doing the open
1802          * itself.
1803          */
1804         lp->lwp_dupfd = -1;
1805
1806         /*
1807          * Call vn_open() to do the lookup and assign the vnode to the
1808          * file pointer.  vn_open() does not change the ref count on fp
1809          * and the vnode, on success, will be inherited by the file pointer
1810          * and unlocked.
1811          */
1812         nd->nl_flags |= NLC_LOCKVP;
1813         error = vn_open(nd, fp, flags, cmode);
1814         nlookup_done(nd);
1815         if (error) {
1816                 /*
1817                  * handle special fdopen() case.  bleh.  dupfdopen() is
1818                  * responsible for dropping the old contents of ofiles[indx]
1819                  * if it succeeds.
1820                  *
1821                  * Note that fsetfd() will add a ref to fp which represents
1822                  * the fd_files[] assignment.  We must still drop our
1823                  * reference.
1824                  */
1825                 if ((error == ENODEV || error == ENXIO) && lp->lwp_dupfd >= 0) {
1826                         if (fdalloc(p, 0, &indx) == 0) {
1827                                 error = dupfdopen(fdp, indx, lp->lwp_dupfd, flags, error);
1828                                 if (error == 0) {
1829                                         *res = indx;
1830                                         fdrop(fp);      /* our ref */
1831                                         return (0);
1832                                 }
1833                                 fsetfd(fdp, NULL, indx);
1834                         }
1835                 }
1836                 fdrop(fp);      /* our ref */
1837                 if (error == ERESTART)
1838                         error = EINTR;
1839                 return (error);
1840         }
1841
1842         /*
1843          * ref the vnode for ourselves so it can't be ripped out from under
1844          * is.  XXX need an ND flag to request that the vnode be returned
1845          * anyway.
1846          *
1847          * Reserve a file descriptor but do not assign it until the open
1848          * succeeds.
1849          */
1850         vp = (struct vnode *)fp->f_data;
1851         vref(vp);
1852         if ((error = fdalloc(p, 0, &indx)) != 0) {
1853                 fdrop(fp);
1854                 vrele(vp);
1855                 return (error);
1856         }
1857
1858         /*
1859          * If no error occurs the vp will have been assigned to the file
1860          * pointer.
1861          */
1862         lp->lwp_dupfd = 0;
1863
1864         if (flags & (O_EXLOCK | O_SHLOCK)) {
1865                 lf.l_whence = SEEK_SET;
1866                 lf.l_start = 0;
1867                 lf.l_len = 0;
1868                 if (flags & O_EXLOCK)
1869                         lf.l_type = F_WRLCK;
1870                 else
1871                         lf.l_type = F_RDLCK;
1872                 if (flags & FNONBLOCK)
1873                         type = 0;
1874                 else
1875                         type = F_WAIT;
1876
1877                 if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
1878                         /*
1879                          * lock request failed.  Clean up the reserved
1880                          * descriptor.
1881                          */
1882                         vrele(vp);
1883                         fsetfd(fdp, NULL, indx);
1884                         fdrop(fp);
1885                         return (error);
1886                 }
1887                 fp->f_flag |= FHASLOCK;
1888         }
1889 #if 0
1890         /*
1891          * Assert that all regular file vnodes were created with a object.
1892          */
1893         KASSERT(vp->v_type != VREG || vp->v_object != NULL,
1894                 ("open: regular file has no backing object after vn_open"));
1895 #endif
1896
1897         vrele(vp);
1898
1899         /*
1900          * release our private reference, leaving the one associated with the
1901          * descriptor table intact.
1902          */
1903         fsetfd(fdp, fp, indx);
1904         fdrop(fp);
1905         *res = indx;
1906         return (0);
1907 }
1908
1909 /*
1910  * open_args(char *path, int flags, int mode)
1911  *
1912  * Check permissions, allocate an open file structure,
1913  * and call the device open routine if any.
1914  *
1915  * MPALMOSTSAFE
1916  */
1917 int
1918 sys_open(struct open_args *uap)
1919 {
1920         struct nlookupdata nd;
1921         int error;
1922
1923         get_mplock();
1924         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1925         if (error == 0) {
1926                 error = kern_open(&nd, uap->flags,
1927                                     uap->mode, &uap->sysmsg_result);
1928         }
1929         nlookup_done(&nd);
1930         rel_mplock();
1931         return (error);
1932 }
1933
1934 /*
1935  * openat_args(int fd, char *path, int flags, int mode)
1936  *
1937  * MPALMOSTSAFE
1938  */
1939 int
1940 sys_openat(struct openat_args *uap)
1941 {
1942         struct nlookupdata nd;
1943         int error;
1944         struct file *fp;
1945
1946         get_mplock();
1947         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
1948         if (error == 0) {
1949                 error = kern_open(&nd, uap->flags, uap->mode,
1950                                         &uap->sysmsg_result);
1951         }
1952         nlookup_done_at(&nd, fp);
1953         rel_mplock();
1954         return (error);
1955 }
1956
1957 int
1958 kern_mknod(struct nlookupdata *nd, int mode, int rmajor, int rminor)
1959 {
1960         struct thread *td = curthread;
1961         struct proc *p = td->td_proc;
1962         struct vnode *vp;
1963         struct vattr vattr;
1964         int error;
1965         int whiteout = 0;
1966
1967         KKASSERT(p);
1968
1969         VATTR_NULL(&vattr);
1970         vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
1971         vattr.va_rmajor = rmajor;
1972         vattr.va_rminor = rminor;
1973
1974         switch (mode & S_IFMT) {
1975         case S_IFMT:    /* used by badsect to flag bad sectors */
1976                 error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_BAD, 0);
1977                 vattr.va_type = VBAD;
1978                 break;
1979         case S_IFCHR:
1980                 error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1981                 vattr.va_type = VCHR;
1982                 break;
1983         case S_IFBLK:
1984                 error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1985                 vattr.va_type = VBLK;
1986                 break;
1987         case S_IFWHT:
1988                 error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_WHT, 0);
1989                 whiteout = 1;
1990                 break;
1991         case S_IFDIR:   /* special directories support for HAMMER */
1992                 error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_DIR, 0);
1993                 vattr.va_type = VDIR;
1994                 break;
1995         default:
1996                 error = EINVAL;
1997                 break;
1998         }
1999
2000         if (error)
2001                 return (error);
2002
2003         bwillinode(1);
2004         nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2005         if ((error = nlookup(nd)) != 0)
2006                 return (error);
2007         if (nd->nl_nch.ncp->nc_vp)
2008                 return (EEXIST);
2009         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2010                 return (error);
2011
2012         if (whiteout) {
2013                 error = VOP_NWHITEOUT(&nd->nl_nch, nd->nl_dvp,
2014                                       nd->nl_cred, NAMEI_CREATE);
2015         } else {
2016                 vp = NULL;
2017                 error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp,
2018                                    &vp, nd->nl_cred, &vattr);
2019                 if (error == 0)
2020                         vput(vp);
2021         }
2022         return (error);
2023 }
2024
2025 /*
2026  * mknod_args(char *path, int mode, int dev)
2027  *
2028  * Create a special file.
2029  *
2030  * MPALMOSTSAFE
2031  */
2032 int
2033 sys_mknod(struct mknod_args *uap)
2034 {
2035         struct nlookupdata nd;
2036         int error;
2037
2038         get_mplock();
2039         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2040         if (error == 0) {
2041                 error = kern_mknod(&nd, uap->mode,
2042                                    umajor(uap->dev), uminor(uap->dev));
2043         }
2044         nlookup_done(&nd);
2045         rel_mplock();
2046         return (error);
2047 }
2048
2049 int
2050 kern_mkfifo(struct nlookupdata *nd, int mode)
2051 {
2052         struct thread *td = curthread;
2053         struct proc *p = td->td_proc;
2054         struct vattr vattr;
2055         struct vnode *vp;
2056         int error;
2057
2058         bwillinode(1);
2059
2060         nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2061         if ((error = nlookup(nd)) != 0)
2062                 return (error);
2063         if (nd->nl_nch.ncp->nc_vp)
2064                 return (EEXIST);
2065         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2066                 return (error);
2067
2068         VATTR_NULL(&vattr);
2069         vattr.va_type = VFIFO;
2070         vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
2071         vp = NULL;
2072         error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp, &vp, nd->nl_cred, &vattr);
2073         if (error == 0)
2074                 vput(vp);
2075         return (error);
2076 }
2077
2078 /*
2079  * mkfifo_args(char *path, int mode)
2080  *
2081  * Create a named pipe.
2082  *
2083  * MPALMOSTSAFE
2084  */
2085 int
2086 sys_mkfifo(struct mkfifo_args *uap)
2087 {
2088         struct nlookupdata nd;
2089         int error;
2090
2091         get_mplock();
2092         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2093         if (error == 0)
2094                 error = kern_mkfifo(&nd, uap->mode);
2095         nlookup_done(&nd);
2096         rel_mplock();
2097         return (error);
2098 }
2099
2100 static int hardlink_check_uid = 0;
2101 SYSCTL_INT(_security, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
2102     &hardlink_check_uid, 0,
2103     "Unprivileged processes cannot create hard links to files owned by other "
2104     "users");
2105 static int hardlink_check_gid = 0;
2106 SYSCTL_INT(_security, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
2107     &hardlink_check_gid, 0,
2108     "Unprivileged processes cannot create hard links to files owned by other "
2109     "groups");
2110
2111 static int
2112 can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
2113 {
2114         struct vattr va;
2115         int error;
2116
2117         /*
2118          * Shortcut if disabled
2119          */
2120         if (hardlink_check_uid == 0 && hardlink_check_gid == 0)
2121                 return (0);
2122
2123         /*
2124          * Privileged user can always hardlink
2125          */
2126         if (priv_check_cred(cred, PRIV_VFS_LINK, 0) == 0)
2127                 return (0);
2128
2129         /*
2130          * Otherwise only if the originating file is owned by the
2131          * same user or group.  Note that any group is allowed if
2132          * the file is owned by the caller.
2133          */
2134         error = VOP_GETATTR(vp, &va);
2135         if (error != 0)
2136                 return (error);
2137
2138         if (hardlink_check_uid) {
2139                 if (cred->cr_uid != va.va_uid)
2140                         return (EPERM);
2141         }
2142
2143         if (hardlink_check_gid) {
2144                 if (cred->cr_uid != va.va_uid && !groupmember(va.va_gid, cred))
2145                         return (EPERM);
2146         }
2147
2148         return (0);
2149 }
2150
2151 int
2152 kern_link(struct nlookupdata *nd, struct nlookupdata *linknd)
2153 {
2154         struct thread *td = curthread;
2155         struct vnode *vp;
2156         int error;
2157
2158         /*
2159          * Lookup the source and obtained a locked vnode.
2160          *
2161          * You may only hardlink a file which you have write permission
2162          * on or which you own.
2163          *
2164          * XXX relookup on vget failure / race ?
2165          */
2166         bwillinode(1);
2167         nd->nl_flags |= NLC_WRITE | NLC_OWN | NLC_HLINK;
2168         if ((error = nlookup(nd)) != 0)
2169                 return (error);
2170         vp = nd->nl_nch.ncp->nc_vp;
2171         KKASSERT(vp != NULL);
2172         if (vp->v_type == VDIR)
2173                 return (EPERM);         /* POSIX */
2174         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2175                 return (error);
2176         if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
2177                 return (error);
2178
2179         /*
2180          * Unlock the source so we can lookup the target without deadlocking
2181          * (XXX vp is locked already, possible other deadlock?).  The target
2182          * must not exist.
2183          */
2184         KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
2185         nd->nl_flags &= ~NLC_NCPISLOCKED;
2186         cache_unlock(&nd->nl_nch);
2187         vn_unlock(vp);
2188
2189         linknd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2190         if ((error = nlookup(linknd)) != 0) {
2191                 vrele(vp);
2192                 return (error);
2193         }
2194         if (linknd->nl_nch.ncp->nc_vp) {
2195                 vrele(vp);
2196                 return (EEXIST);
2197         }
2198         if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) != 0) {
2199                 vrele(vp);
2200                 return (error);
2201         }
2202
2203         /*
2204          * Finally run the new API VOP.
2205          */
2206         error = can_hardlink(vp, td, td->td_ucred);
2207         if (error == 0) {
2208                 error = VOP_NLINK(&linknd->nl_nch, linknd->nl_dvp,
2209                                   vp, linknd->nl_cred);
2210         }
2211         vput(vp);
2212         return (error);
2213 }
2214
2215 /*
2216  * link_args(char *path, char *link)
2217  *
2218  * Make a hard file link.
2219  *
2220  * MPALMOSTSAFE
2221  */
2222 int
2223 sys_link(struct link_args *uap)
2224 {
2225         struct nlookupdata nd, linknd;
2226         int error;
2227
2228         get_mplock();
2229         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2230         if (error == 0) {
2231                 error = nlookup_init(&linknd, uap->link, UIO_USERSPACE, 0);
2232                 if (error == 0)
2233                         error = kern_link(&nd, &linknd);
2234                 nlookup_done(&linknd);
2235         }
2236         nlookup_done(&nd);
2237         rel_mplock();
2238         return (error);
2239 }
2240
2241 int
2242 kern_symlink(struct nlookupdata *nd, char *path, int mode)
2243 {
2244         struct vattr vattr;
2245         struct vnode *vp;
2246         struct vnode *dvp;
2247         int error;
2248
2249         bwillinode(1);
2250         nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2251         if ((error = nlookup(nd)) != 0)
2252                 return (error);
2253         if (nd->nl_nch.ncp->nc_vp)
2254                 return (EEXIST);
2255         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2256                 return (error);
2257         dvp = nd->nl_dvp;
2258         VATTR_NULL(&vattr);
2259         vattr.va_mode = mode;
2260         error = VOP_NSYMLINK(&nd->nl_nch, dvp, &vp, nd->nl_cred, &vattr, path);
2261         if (error == 0)
2262                 vput(vp);
2263         return (error);
2264 }
2265
2266 /*
2267  * symlink(char *path, char *link)
2268  *
2269  * Make a symbolic link.
2270  *
2271  * MPALMOSTSAFE
2272  */
2273 int
2274 sys_symlink(struct symlink_args *uap)
2275 {
2276         struct thread *td = curthread;
2277         struct nlookupdata nd;
2278         char *path;
2279         int error;
2280         int mode;
2281
2282         path = objcache_get(namei_oc, M_WAITOK);
2283         error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
2284         if (error == 0) {
2285                 get_mplock();
2286                 error = nlookup_init(&nd, uap->link, UIO_USERSPACE, 0);
2287                 if (error == 0) {
2288                         mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2289                         error = kern_symlink(&nd, path, mode);
2290                 }
2291                 nlookup_done(&nd);
2292                 rel_mplock();
2293         }
2294         objcache_put(namei_oc, path);
2295         return (error);
2296 }
2297
2298 /*
2299  * undelete_args(char *path)
2300  *
2301  * Delete a whiteout from the filesystem.
2302  *
2303  * MPALMOSTSAFE
2304  */
2305 int
2306 sys_undelete(struct undelete_args *uap)
2307 {
2308         struct nlookupdata nd;
2309         int error;
2310
2311         get_mplock();
2312         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2313         bwillinode(1);
2314         nd.nl_flags |= NLC_DELETE | NLC_REFDVP;
2315         if (error == 0)
2316                 error = nlookup(&nd);
2317         if (error == 0)
2318                 error = ncp_writechk(&nd.nl_nch);
2319         if (error == 0) {
2320                 error = VOP_NWHITEOUT(&nd.nl_nch, nd.nl_dvp, nd.nl_cred,
2321                                       NAMEI_DELETE);
2322         }
2323         nlookup_done(&nd);
2324         rel_mplock();
2325         return (error);
2326 }
2327
2328 int
2329 kern_unlink(struct nlookupdata *nd)
2330 {
2331         int error;
2332
2333         bwillinode(1);
2334         nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
2335         if ((error = nlookup(nd)) != 0)
2336                 return (error);
2337         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2338                 return (error);
2339         error = VOP_NREMOVE(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
2340         return (error);
2341 }
2342
2343 /*
2344  * unlink_args(char *path)
2345  *
2346  * Delete a name from the filesystem.
2347  *
2348  * MPALMOSTSAFE
2349  */
2350 int
2351 sys_unlink(struct unlink_args *uap)
2352 {
2353         struct nlookupdata nd;
2354         int error;
2355
2356         get_mplock();
2357         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2358         if (error == 0)
2359                 error = kern_unlink(&nd);
2360         nlookup_done(&nd);
2361         rel_mplock();
2362         return (error);
2363 }
2364
2365
2366 /*
2367  * unlinkat_args(int fd, char *path, int flags)
2368  *
2369  * Delete the file or directory entry pointed to by fd/path.
2370  *
2371  * MPALMOSTSAFE
2372  */
2373 int
2374 sys_unlinkat(struct unlinkat_args *uap)
2375 {
2376         struct nlookupdata nd;
2377         struct file *fp;
2378         int error;
2379
2380         if (uap->flags & ~AT_REMOVEDIR)
2381                 return (EINVAL);
2382
2383         get_mplock();
2384         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2385         if (error == 0) {
2386                 if (uap->flags & AT_REMOVEDIR)
2387                         error = kern_rmdir(&nd);
2388                 else
2389                         error = kern_unlink(&nd);
2390         }
2391         nlookup_done_at(&nd, fp);
2392         rel_mplock();
2393         return (error);
2394 }
2395
2396 /*
2397  * MPALMOSTSAFE
2398  */
2399 int
2400 kern_lseek(int fd, off_t offset, int whence, off_t *res)
2401 {
2402         struct thread *td = curthread;
2403         struct proc *p = td->td_proc;
2404         struct file *fp;
2405         struct vnode *vp;
2406         struct vattr vattr;
2407         off_t new_offset;
2408         int error;
2409
2410         fp = holdfp(p->p_fd, fd, -1);
2411         if (fp == NULL)
2412                 return (EBADF);
2413         if (fp->f_type != DTYPE_VNODE) {
2414                 error = ESPIPE;
2415                 goto done;
2416         }
2417         vp = (struct vnode *)fp->f_data;
2418
2419         switch (whence) {
2420         case L_INCR:
2421                 spin_lock_wr(&fp->f_spin);
2422                 new_offset = fp->f_offset + offset;
2423                 error = 0;
2424                 break;
2425         case L_XTND:
2426                 get_mplock();
2427                 error = VOP_GETATTR(vp, &vattr);
2428                 rel_mplock();
2429                 spin_lock_wr(&fp->f_spin);
2430                 new_offset = offset + vattr.va_size;
2431                 break;
2432         case L_SET:
2433                 new_offset = offset;
2434                 error = 0;
2435                 spin_lock_wr(&fp->f_spin);
2436                 break;
2437         default:
2438                 new_offset = 0;
2439                 error = EINVAL;
2440                 spin_lock_wr(&fp->f_spin);
2441                 break;
2442         }
2443
2444         /*
2445          * Validate the seek position.  Negative offsets are not allowed
2446          * for regular files or directories.
2447          *
2448          * Normally we would also not want to allow negative offsets for
2449          * character and block-special devices.  However kvm addresses
2450          * on 64 bit architectures might appear to be negative and must
2451          * be allowed.
2452          */
2453         if (error == 0) {
2454                 if (new_offset < 0 &&
2455                     (vp->v_type == VREG || vp->v_type == VDIR)) {
2456                         error = EINVAL;
2457                 } else {
2458                         fp->f_offset = new_offset;
2459                 }
2460         }
2461         *res = fp->f_offset;
2462         spin_unlock_wr(&fp->f_spin);
2463 done:
2464         fdrop(fp);
2465         return (error);
2466 }
2467
2468 /*
2469  * lseek_args(int fd, int pad, off_t offset, int whence)
2470  *
2471  * Reposition read/write file offset.
2472  *
2473  * MPSAFE
2474  */
2475 int
2476 sys_lseek(struct lseek_args *uap)
2477 {
2478         int error;
2479
2480         error = kern_lseek(uap->fd, uap->offset, uap->whence,
2481                            &uap->sysmsg_offset);
2482
2483         return (error);
2484 }
2485
2486 /*
2487  * Check if current process can access given file.  amode is a bitmask of *_OK
2488  * access bits.  flags is a bitmask of AT_* flags.
2489  */
2490 int
2491 kern_access(struct nlookupdata *nd, int amode, int flags)
2492 {
2493         struct vnode *vp;
2494         int error, mode;
2495
2496         if (flags & ~AT_EACCESS)
2497                 return (EINVAL);
2498         if ((error = nlookup(nd)) != 0)
2499                 return (error);
2500 retry:
2501         error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &vp);
2502         if (error)
2503                 return (error);
2504
2505         /* Flags == 0 means only check for existence. */
2506         if (amode) {
2507                 mode = 0;
2508                 if (amode & R_OK)
2509                         mode |= VREAD;
2510                 if (amode & W_OK)
2511                         mode |= VWRITE;
2512                 if (amode & X_OK)
2513                         mode |= VEXEC;
2514                 if ((mode & VWRITE) == 0 ||
2515                     (error = vn_writechk(vp, &nd->nl_nch)) == 0)
2516                         error = VOP_ACCESS_FLAGS(vp, mode, flags, nd->nl_cred);
2517
2518                 /*
2519                  * If the file handle is stale we have to re-resolve the
2520                  * entry.  This is a hack at the moment.
2521                  */
2522                 if (error == ESTALE) {
2523                         vput(vp);
2524                         cache_setunresolved(&nd->nl_nch);
2525                         error = cache_resolve(&nd->nl_nch, nd->nl_cred);
2526                         if (error == 0) {
2527                                 vp = NULL;
2528                                 goto retry;
2529                         }
2530                         return(error);
2531                 }
2532         }
2533         vput(vp);
2534         return (error);
2535 }
2536
2537 /*
2538  * access_args(char *path, int flags)
2539  *
2540  * Check access permissions.
2541  *
2542  * MPALMOSTSAFE
2543  */
2544 int
2545 sys_access(struct access_args *uap)
2546 {
2547         struct nlookupdata nd;
2548         int error;
2549
2550         get_mplock();
2551         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2552         if (error == 0)
2553                 error = kern_access(&nd, uap->flags, 0);
2554         nlookup_done(&nd);
2555         rel_mplock();
2556         return (error);
2557 }
2558
2559
2560 /*
2561  * faccessat_args(int fd, char *path, int amode, int flags)
2562  *
2563  * Check access permissions.
2564  *
2565  * MPALMOSTSAFE
2566  */
2567 int
2568 sys_faccessat(struct faccessat_args *uap)
2569 {
2570         struct nlookupdata nd;
2571         struct file *fp;
2572         int error;
2573
2574         get_mplock();
2575         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE,
2576                                 NLC_FOLLOW);
2577         if (error == 0)
2578                 error = kern_access(&nd, uap->amode, uap->flags);
2579         nlookup_done_at(&nd, fp);
2580         rel_mplock();
2581         return (error);
2582 }
2583
2584
2585 int
2586 kern_stat(struct nlookupdata *nd, struct stat *st)
2587 {
2588         int error;
2589         struct vnode *vp;
2590         thread_t td;
2591
2592         if ((error = nlookup(nd)) != 0)
2593                 return (error);
2594 again:
2595         if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
2596                 return (ENOENT);
2597
2598         td = curthread;
2599         if ((error = vget(vp, LK_SHARED)) != 0)
2600                 return (error);
2601         error = vn_stat(vp, st, nd->nl_cred);
2602
2603         /*
2604          * If the file handle is stale we have to re-resolve the entry.  This
2605          * is a hack at the moment.
2606          */
2607         if (error == ESTALE) {
2608                 vput(vp);
2609                 cache_setunresolved(&nd->nl_nch);
2610                 error = cache_resolve(&nd->nl_nch, nd->nl_cred);
2611                 if (error == 0)
2612                         goto again;
2613         } else {
2614                 vput(vp);
2615         }
2616         return (error);
2617 }
2618
2619 /*
2620  * stat_args(char *path, struct stat *ub)
2621  *
2622  * Get file status; this version follows links.
2623  *
2624  * MPALMOSTSAFE
2625  */
2626 int
2627 sys_stat(struct stat_args *uap)
2628 {
2629         struct nlookupdata nd;
2630         struct stat st;
2631         int error;
2632
2633         get_mplock();
2634         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2635         if (error == 0) {
2636                 error = kern_stat(&nd, &st);
2637                 if (error == 0)
2638                         error = copyout(&st, uap->ub, sizeof(*uap->ub));
2639         }
2640         nlookup_done(&nd);
2641         rel_mplock();
2642         return (error);
2643 }
2644
2645 /*
2646  * lstat_args(char *path, struct stat *ub)
2647  *
2648  * Get file status; this version does not follow links.
2649  *
2650  * MPALMOSTSAFE
2651  */
2652 int
2653 sys_lstat(struct lstat_args *uap)
2654 {
2655         struct nlookupdata nd;
2656         struct stat st;
2657         int error;
2658
2659         get_mplock();
2660         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2661         if (error == 0) {
2662                 error = kern_stat(&nd, &st);
2663                 if (error == 0)
2664                         error = copyout(&st, uap->ub, sizeof(*uap->ub));
2665         }
2666         nlookup_done(&nd);
2667         rel_mplock();
2668         return (error);
2669 }
2670
2671 /*
2672  * fstatat_args(int fd, char *path, struct stat *sb, int flags)
2673  *
2674  * Get status of file pointed to by fd/path.
2675  *
2676  * MPALMOSTSAFE
2677  */
2678 int
2679 sys_fstatat(struct fstatat_args *uap)
2680 {
2681         struct nlookupdata nd;
2682         struct stat st;
2683         int error;
2684         int flags;
2685         struct file *fp;
2686
2687         if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
2688                 return (EINVAL);
2689
2690         flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
2691
2692         get_mplock();
2693         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
2694                                 UIO_USERSPACE, flags);
2695         if (error == 0) {
2696                 error = kern_stat(&nd, &st);
2697                 if (error == 0)
2698                         error = copyout(&st, uap->sb, sizeof(*uap->sb));
2699         }
2700         nlookup_done_at(&nd, fp);
2701         rel_mplock();
2702         return (error);
2703 }
2704
2705 /*
2706  * pathconf_Args(char *path, int name)
2707  *
2708  * Get configurable pathname variables.
2709  *
2710  * MPALMOSTSAFE
2711  */
2712 int
2713 sys_pathconf(struct pathconf_args *uap)
2714 {
2715         struct nlookupdata nd;
2716         struct vnode *vp;
2717         int error;
2718
2719         vp = NULL;
2720         get_mplock();
2721         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2722         if (error == 0)
2723                 error = nlookup(&nd);
2724         if (error == 0)
2725                 error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
2726         nlookup_done(&nd);
2727         if (error == 0) {
2728                 error = VOP_PATHCONF(vp, uap->name, &uap->sysmsg_reg);
2729                 vput(vp);
2730         }
2731         rel_mplock();
2732         return (error);
2733 }
2734
2735 /*
2736  * XXX: daver
2737  * kern_readlink isn't properly split yet.  There is a copyin burried
2738  * in VOP_READLINK().
2739  */
2740 int
2741 kern_readlink(struct nlookupdata *nd, char *buf, int count, int *res)
2742 {
2743         struct thread *td = curthread;
2744         struct vnode *vp;
2745         struct iovec aiov;
2746         struct uio auio;
2747         int error;
2748
2749         if ((error = nlookup(nd)) != 0)
2750                 return (error);
2751         error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &vp);
2752         if (error)
2753                 return (error);
2754         if (vp->v_type != VLNK) {
2755                 error = EINVAL;
2756         } else {
2757                 aiov.iov_base = buf;
2758                 aiov.iov_len = count;
2759                 auio.uio_iov = &aiov;
2760                 auio.uio_iovcnt = 1;
2761                 auio.uio_offset = 0;
2762                 auio.uio_rw = UIO_READ;
2763                 auio.uio_segflg = UIO_USERSPACE;
2764                 auio.uio_td = td;
2765                 auio.uio_resid = count;
2766                 error = VOP_READLINK(vp, &auio, td->td_ucred);
2767         }
2768         vput(vp);
2769         *res = count - auio.uio_resid;
2770         return (error);
2771 }
2772
2773 /*
2774  * readlink_args(char *path, char *buf, int count)
2775  *
2776  * Return target name of a symbolic link.
2777  *
2778  * MPALMOSTSAFE
2779  */
2780 int
2781 sys_readlink(struct readlink_args *uap)
2782 {
2783         struct nlookupdata nd;
2784         int error;
2785
2786         get_mplock();
2787         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2788         if (error == 0) {
2789                 error = kern_readlink(&nd, uap->buf, uap->count,
2790                                         &uap->sysmsg_result);
2791         }
2792         nlookup_done(&nd);
2793         rel_mplock();
2794         return (error);
2795 }
2796
2797 static int
2798 setfflags(struct vnode *vp, int flags)
2799 {
2800         struct thread *td = curthread;
2801         int error;
2802         struct vattr vattr;
2803
2804         /*
2805          * Prevent non-root users from setting flags on devices.  When
2806          * a device is reused, users can retain ownership of the device
2807          * if they are allowed to set flags and programs assume that
2808          * chown can't fail when done as root.
2809          */
2810         if ((vp->v_type == VCHR || vp->v_type == VBLK) &&
2811             ((error = priv_check_cred(td->td_ucred, PRIV_VFS_CHFLAGS_DEV, 0)) != 0))
2812                 return (error);
2813
2814         /*
2815          * note: vget is required for any operation that might mod the vnode
2816          * so VINACTIVE is properly cleared.
2817          */
2818         if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2819                 VATTR_NULL(&vattr);
2820                 vattr.va_flags = flags;
2821                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2822                 vput(vp);
2823         }
2824         return (error);
2825 }
2826
2827 /*
2828  * chflags(char *path, int flags)
2829  *
2830  * Change flags of a file given a path name.
2831  *
2832  * MPALMOSTSAFE
2833  */
2834 int
2835 sys_chflags(struct chflags_args *uap)
2836 {
2837         struct nlookupdata nd;
2838         struct vnode *vp;
2839         int error;
2840
2841         vp = NULL;
2842         get_mplock();
2843         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2844         if (error == 0)
2845                 error = nlookup(&nd);
2846         if (error == 0)
2847                 error = ncp_writechk(&nd.nl_nch);
2848         if (error == 0)
2849                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
2850         nlookup_done(&nd);
2851         if (error == 0) {
2852                 error = setfflags(vp, uap->flags);
2853                 vrele(vp);
2854         }
2855         rel_mplock();
2856         return (error);
2857 }
2858
2859 /*
2860  * lchflags(char *path, int flags)
2861  *
2862  * Change flags of a file given a path name, but don't follow symlinks.
2863  *
2864  * MPALMOSTSAFE
2865  */
2866 int
2867 sys_lchflags(struct lchflags_args *uap)
2868 {
2869         struct nlookupdata nd;
2870         struct vnode *vp;
2871         int error;
2872
2873         vp = NULL;
2874         get_mplock();
2875         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2876         if (error == 0)
2877                 error = nlookup(&nd);
2878         if (error == 0)
2879                 error = ncp_writechk(&nd.nl_nch);
2880         if (error == 0)
2881                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
2882         nlookup_done(&nd);
2883         if (error == 0) {
2884                 error = setfflags(vp, uap->flags);
2885                 vrele(vp);
2886         }
2887         rel_mplock();
2888         return (error);
2889 }
2890
2891 /*
2892  * fchflags_args(int fd, int flags)
2893  *
2894  * Change flags of a file given a file descriptor.
2895  *
2896  * MPALMOSTSAFE
2897  */
2898 int
2899 sys_fchflags(struct fchflags_args *uap)
2900 {
2901         struct thread *td = curthread;
2902         struct proc *p = td->td_proc;
2903         struct file *fp;
2904         int error;
2905
2906         if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2907                 return (error);
2908         get_mplock();
2909         if (fp->f_nchandle.ncp)
2910                 error = ncp_writechk(&fp->f_nchandle);
2911         if (error == 0)
2912                 error = setfflags((struct vnode *) fp->f_data, uap->flags);
2913         rel_mplock();
2914         fdrop(fp);
2915         return (error);
2916 }
2917
2918 static int
2919 setfmode(struct vnode *vp, int mode)
2920 {
2921         struct thread *td = curthread;
2922         int error;
2923         struct vattr vattr;
2924
2925         /*
2926          * note: vget is required for any operation that might mod the vnode
2927          * so VINACTIVE is properly cleared.
2928          */
2929         if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2930                 VATTR_NULL(&vattr);
2931                 vattr.va_mode = mode & ALLPERMS;
2932                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2933                 vput(vp);
2934         }
2935         return error;
2936 }
2937
2938 int
2939 kern_chmod(struct nlookupdata *nd, int mode)
2940 {
2941         struct vnode *vp;
2942         int error;
2943
2944         if ((error = nlookup(nd)) != 0)
2945                 return (error);
2946         if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
2947                 return (error);
2948         if ((error = ncp_writechk(&nd->nl_nch)) == 0)
2949                 error = setfmode(vp, mode);
2950         vrele(vp);
2951         return (error);
2952 }
2953
2954 /*
2955  * chmod_args(char *path, int mode)
2956  *
2957  * Change mode of a file given path name.
2958  *
2959  * MPALMOSTSAFE
2960  */
2961 int
2962 sys_chmod(struct chmod_args *uap)
2963 {
2964         struct nlookupdata nd;
2965         int error;
2966
2967         get_mplock();
2968         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2969         if (error == 0)
2970                 error = kern_chmod(&nd, uap->mode);
2971         nlookup_done(&nd);
2972         rel_mplock();
2973         return (error);
2974 }
2975
2976 /*
2977  * lchmod_args(char *path, int mode)
2978  *
2979  * Change mode of a file given path name (don't follow links.)
2980  *
2981  * MPALMOSTSAFE
2982  */
2983 int
2984 sys_lchmod(struct lchmod_args *uap)
2985 {
2986         struct nlookupdata nd;
2987         int error;
2988
2989         get_mplock();
2990         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2991         if (error == 0)
2992                 error = kern_chmod(&nd, uap->mode);
2993         nlookup_done(&nd);
2994         rel_mplock();
2995         return (error);
2996 }
2997
2998 /*
2999  * fchmod_args(int fd, int mode)
3000  *
3001  * Change mode of a file given a file descriptor.
3002  *
3003  * MPALMOSTSAFE
3004  */
3005 int
3006 sys_fchmod(struct fchmod_args *uap)
3007 {
3008         struct thread *td = curthread;
3009         struct proc *p = td->td_proc;
3010         struct file *fp;
3011         int error;
3012
3013         if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3014                 return (error);
3015         get_mplock();
3016         if (fp->f_nchandle.ncp)
3017                 error = ncp_writechk(&fp->f_nchandle);
3018         if (error == 0)
3019                 error = setfmode((struct vnode *)fp->f_data, uap->mode);
3020         rel_mplock();
3021         fdrop(fp);
3022         return (error);
3023 }
3024
3025 /*
3026  * fchmodat_args(char *path, int mode)
3027  *
3028  * Change mode of a file pointed to by fd/path.
3029  *
3030  * MPALMOSTSAFE
3031  */
3032 int
3033 sys_fchmodat(struct fchmodat_args *uap)
3034 {
3035         struct nlookupdata nd;
3036         struct file *fp;
3037         int error;
3038         int flags;
3039
3040         if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3041                 return (EINVAL);
3042         flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3043
3044         get_mplock();
3045         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3046                                 UIO_USERSPACE, flags);
3047         if (error == 0)
3048                 error = kern_chmod(&nd, uap->mode);
3049         nlookup_done_at(&nd, fp);
3050         rel_mplock();
3051         return (error);
3052 }
3053
3054 static int
3055 setfown(struct vnode *vp, uid_t uid, gid_t gid)
3056 {
3057         struct thread *td = curthread;
3058         int error;
3059         struct vattr vattr;
3060
3061         /*
3062          * note: vget is required for any operation that might mod the vnode
3063          * so VINACTIVE is properly cleared.
3064          */
3065         if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3066                 VATTR_NULL(&vattr);
3067                 vattr.va_uid = uid;
3068                 vattr.va_gid = gid;
3069                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3070                 vput(vp);
3071         }
3072         return error;
3073 }
3074
3075 int
3076 kern_chown(struct nlookupdata *nd, int uid, int gid)
3077 {
3078         struct vnode *vp;
3079         int error;
3080
3081         if ((error = nlookup(nd)) != 0)
3082                 return (error);
3083         if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3084                 return (error);
3085         if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3086                 error = setfown(vp, uid, gid);
3087         vrele(vp);
3088         return (error);
3089 }
3090
3091 /*
3092  * chown(char *path, int uid, int gid)
3093  *
3094  * Set ownership given a path name.
3095  *
3096  * MPALMOSTSAFE
3097  */
3098 int
3099 sys_chown(struct chown_args *uap)
3100 {
3101         struct nlookupdata nd;
3102         int error;
3103
3104         get_mplock();
3105         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3106         if (error == 0)
3107                 error = kern_chown(&nd, uap->uid, uap->gid);
3108         nlookup_done(&nd);
3109         rel_mplock();
3110         return (error);
3111 }
3112
3113 /*
3114  * lchown_args(char *path, int uid, int gid)
3115  *
3116  * Set ownership given a path name, do not cross symlinks.
3117  *
3118  * MPALMOSTSAFE
3119  */
3120 int
3121 sys_lchown(struct lchown_args *uap)
3122 {
3123         struct nlookupdata nd;
3124         int error;
3125
3126         get_mplock();
3127         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3128         if (error == 0)
3129                 error = kern_chown(&nd, uap->uid, uap->gid);
3130         nlookup_done(&nd);
3131         rel_mplock();
3132         return (error);
3133 }
3134
3135 /*
3136  * fchown_args(int fd, int uid, int gid)
3137  *
3138  * Set ownership given a file descriptor.
3139  *
3140  * MPALMOSTSAFE
3141  */
3142 int
3143 sys_fchown(struct fchown_args *uap)
3144 {
3145         struct thread *td = curthread;
3146         struct proc *p = td->td_proc;
3147         struct file *fp;
3148         int error;
3149
3150         if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3151                 return (error);
3152         get_mplock();
3153         if (fp->f_nchandle.ncp)
3154                 error = ncp_writechk(&fp->f_nchandle);
3155         if (error == 0)
3156                 error = setfown((struct vnode *)fp->f_data, uap->uid, uap->gid);
3157         rel_mplock();
3158         fdrop(fp);
3159         return (error);
3160 }
3161
3162 /*
3163  * fchownat(int fd, char *path, int uid, int gid, int flags)
3164  *
3165  * Set ownership of file pointed to by fd/path.
3166  *
3167  * MPALMOSTSAFE
3168  */
3169 int
3170 sys_fchownat(struct fchownat_args *uap)
3171 {
3172         struct nlookupdata nd;
3173         struct file *fp;
3174         int error;
3175         int flags;
3176
3177         if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3178                 return (EINVAL);
3179         flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3180
3181         get_mplock();
3182         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3183                                 UIO_USERSPACE, flags);
3184         if (error == 0)
3185                 error = kern_chown(&nd, uap->uid, uap->gid);
3186         nlookup_done_at(&nd, fp);
3187         rel_mplock();
3188         return (error);
3189 }
3190
3191
3192 static int
3193 getutimes(const struct timeval *tvp, struct timespec *tsp)
3194 {
3195         struct timeval tv[2];
3196
3197         if (tvp == NULL) {
3198                 microtime(&tv[0]);
3199                 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
3200                 tsp[1] = tsp[0];
3201         } else {
3202                 TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3203                 TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3204         }
3205         return 0;
3206 }
3207
3208 static int
3209 setutimes(struct vnode *vp, struct vattr *vattr,
3210           const struct timespec *ts, int nullflag)
3211 {
3212         struct thread *td = curthread;
3213         int error;
3214
3215         VATTR_NULL(vattr);
3216         vattr->va_atime = ts[0];
3217         vattr->va_mtime = ts[1];
3218         if (nullflag)
3219                 vattr->va_vaflags |= VA_UTIMES_NULL;
3220         error = VOP_SETATTR(vp, vattr, td->td_ucred);
3221
3222         return error;
3223 }
3224
3225 int
3226 kern_utimes(struct nlookupdata *nd, struct timeval *tptr)
3227 {
3228         struct timespec ts[2];
3229         struct vnode *vp;
3230         struct vattr vattr;
3231         int error;
3232
3233         if ((error = getutimes(tptr, ts)) != 0)
3234                 return (error);
3235
3236         /*
3237          * NOTE: utimes() succeeds for the owner even if the file
3238          * is not user-writable.
3239          */
3240         nd->nl_flags |= NLC_OWN | NLC_WRITE;
3241
3242         if ((error = nlookup(nd)) != 0)
3243                 return (error);
3244         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3245                 return (error);
3246         if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3247                 return (error);
3248
3249         /*
3250          * note: vget is required for any operation that might mod the vnode
3251          * so VINACTIVE is properly cleared.
3252          */
3253         if ((error = vn_writechk(vp, &nd->nl_nch)) == 0) {
3254                 error = vget(vp, LK_EXCLUSIVE);
3255                 if (error == 0) {
3256                         error = setutimes(vp, &vattr, ts, (tptr == NULL));
3257                         vput(vp);
3258                 }
3259         }
3260         vrele(vp);
3261         return (error);
3262 }
3263
3264 /*
3265  * utimes_args(char *path, struct timeval *tptr)
3266  *
3267  * Set the access and modification times of a file.
3268  *
3269  * MPALMOSTSAFE
3270  */
3271 int
3272 sys_utimes(struct utimes_args *uap)
3273 {
3274         struct timeval tv[2];
3275         struct nlookupdata nd;
3276         int error;
3277
3278         if (uap->tptr) {
3279                 error = copyin(uap->tptr, tv, sizeof(tv));
3280                 if (error)
3281                         return (error);
3282         }
3283         get_mplock();
3284         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3285         if (error == 0)
3286                 error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3287         nlookup_done(&nd);
3288         rel_mplock();
3289         return (error);
3290 }
3291
3292 /*
3293  * lutimes_args(char *path, struct timeval *tptr)
3294  *
3295  * Set the access and modification times of a file.
3296  *
3297  * MPALMOSTSAFE
3298  */
3299 int
3300 sys_lutimes(struct lutimes_args *uap)
3301 {
3302         struct timeval tv[2];
3303         struct nlookupdata nd;
3304         int error;
3305
3306         if (uap->tptr) {
3307                 error = copyin(uap->tptr, tv, sizeof(tv));
3308                 if (error)
3309                         return (error);
3310         }
3311         get_mplock();
3312         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3313         if (error == 0)
3314                 error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3315         nlookup_done(&nd);
3316         rel_mplock();
3317         return (error);
3318 }
3319
3320 /*
3321  * Set utimes on a file descriptor.  The creds used to open the
3322  * file are used to determine whether the operation is allowed
3323  * or not.
3324  */
3325 int
3326 kern_futimes(int fd, struct timeval *tptr)
3327 {
3328         struct thread *td = curthread;
3329         struct proc *p = td->td_proc;
3330         struct timespec ts[2];
3331         struct file *fp;
3332         struct vnode *vp;
3333         struct vattr vattr;
3334         int error;
3335
3336         error = getutimes(tptr, ts);
3337         if (error)
3338                 return (error);
3339         if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
3340                 return (error);
3341         if (fp->f_nchandle.ncp)
3342                 error = ncp_writechk(&fp->f_nchandle);
3343         if (error == 0) {
3344                 vp = fp->f_data;
3345                 error = vget(vp, LK_EXCLUSIVE);
3346                 if (error == 0) {
3347                         error = VOP_GETATTR(vp, &vattr);
3348                         if (error == 0) {
3349                                 error = naccess_va(&vattr, NLC_OWN | NLC_WRITE,
3350                                                    fp->f_cred);
3351                         }
3352                         if (error == 0) {
3353                                 error = setutimes(vp, &vattr, ts,
3354                                                   (tptr == NULL));
3355                         }
3356                         vput(vp);
3357                 }
3358         }
3359         fdrop(fp);
3360         return (error);
3361 }
3362
3363 /*
3364  * futimes_args(int fd, struct timeval *tptr)
3365  *
3366  * Set the access and modification times of a file.
3367  *
3368  * MPALMOSTSAFE
3369  */
3370 int
3371 sys_futimes(struct futimes_args *uap)
3372 {
3373         struct timeval tv[2];
3374         int error;
3375
3376         if (uap->tptr) {
3377                 error = copyin(uap->tptr, tv, sizeof(tv));
3378                 if (error)
3379                         return (error);
3380         }
3381         get_mplock();
3382         error = kern_futimes(uap->fd, uap->tptr ? tv : NULL);
3383         rel_mplock();
3384
3385         return (error);
3386 }
3387
3388 int
3389 kern_truncate(struct nlookupdata *nd, off_t length)
3390 {
3391         struct vnode *vp;
3392         struct vattr vattr;
3393         int error;
3394
3395         if (length < 0)
3396                 return(EINVAL);
3397         nd->nl_flags |= NLC_WRITE | NLC_TRUNCATE;
3398         if ((error = nlookup(nd)) != 0)
3399                 return (error);
3400         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3401                 return (error);
3402         if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3403                 return (error);
3404         if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) != 0) {
3405                 vrele(vp);
3406                 return (error);
3407         }
3408         if (vp->v_type == VDIR) {
3409                 error = EISDIR;
3410         } else if ((error = vn_writechk(vp, &nd->nl_nch)) == 0) {
3411                 VATTR_NULL(&vattr);
3412                 vattr.va_size = length;
3413                 error = VOP_SETATTR(vp, &vattr, nd->nl_cred);
3414         }
3415         vput(vp);
3416         return (error);
3417 }
3418
3419 /*
3420  * truncate(char *path, int pad, off_t length)
3421  *
3422  * Truncate a file given its path name.
3423  *
3424  * MPALMOSTSAFE
3425  */
3426 int
3427 sys_truncate(struct truncate_args *uap)
3428 {
3429         struct nlookupdata nd;
3430         int error;
3431
3432         get_mplock();
3433         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3434         if (error == 0)
3435                 error = kern_truncate(&nd, uap->length);
3436         nlookup_done(&nd);
3437         rel_mplock();
3438         return error;
3439 }
3440
3441 int
3442 kern_ftruncate(int fd, off_t length)
3443 {
3444         struct thread *td = curthread;
3445         struct proc *p = td->td_proc;
3446         struct vattr vattr;
3447         struct vnode *vp;
3448         struct file *fp;
3449         int error;
3450
3451         if (length < 0)
3452                 return(EINVAL);
3453         if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
3454                 return (error);
3455         if (fp->f_nchandle.ncp) {
3456                 error = ncp_writechk(&fp->f_nchandle);
3457                 if (error)
3458                         goto done;
3459         }
3460         if ((fp->f_flag & FWRITE) == 0) {
3461                 error = EINVAL;
3462                 goto done;
3463         }
3464         if (fp->f_flag & FAPPENDONLY) { /* inode was set s/uapnd */
3465                 error = EINVAL;
3466                 goto done;
3467         }
3468         vp = (struct vnode *)fp->f_data;
3469         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3470         if (vp->v_type == VDIR) {
3471                 error = EISDIR;
3472         } else if ((error = vn_writechk(vp, NULL)) == 0) {
3473                 VATTR_NULL(&vattr);
3474                 vattr.va_size = length;
3475                 error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3476         }
3477         vn_unlock(vp);
3478 done:
3479         fdrop(fp);
3480         return (error);
3481 }
3482
3483 /*
3484  * ftruncate_args(int fd, int pad, off_t length)
3485  *
3486  * Truncate a file given a file descriptor.
3487  *
3488  * MPALMOSTSAFE
3489  */
3490 int
3491 sys_ftruncate(struct ftruncate_args *uap)
3492 {
3493         int error;
3494
3495         get_mplock();
3496         error = kern_ftruncate(uap->fd, uap->length);
3497         rel_mplock();
3498
3499         return (error);
3500 }
3501
3502 /*
3503  * fsync(int fd)
3504  *
3505  * Sync an open file.
3506  *
3507  * MPALMOSTSAFE
3508  */
3509 int
3510 sys_fsync(struct fsync_args *uap)
3511 {
3512         struct thread *td = curthread;
3513         struct proc *p = td->td_proc;
3514         struct vnode *vp;
3515         struct file *fp;
3516         vm_object_t obj;
3517         int error;
3518
3519         if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3520                 return (error);
3521         get_mplock();
3522         vp = (struct vnode *)fp->f_data;
3523         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3524         if ((obj = vp->v_object) != NULL)
3525                 vm_object_page_clean(obj, 0, 0, 0);
3526         error = VOP_FSYNC(vp, MNT_WAIT, VOP_FSYNC_SYSCALL);
3527         if (error == 0 && vp->v_mount)
3528                 error = buf_fsync(vp);
3529         vn_unlock(vp);
3530         rel_mplock();
3531         fdrop(fp);
3532
3533         return (error);
3534 }
3535
/*
 * Rename the namecache entry resolved by 'fromnd' to the one resolved
 * by 'tond'.  Both lookups are performed here; the caller owns both
 * nlookupdata structures (initialization and nlookup_done()).
 *
 * Returns 0 on success (including when source and target resolve to the
 * same namecache entry), or an errno value:
 *   ENOENT  - a parent linkage is missing, changed while relocking, or
 *             the source vnode vanished while the target exists
 *   EINVAL  - source or target is a mount point, or the target
 *             directory lies at or beneath the source
 *   EXDEV   - source, target and their parents are not all on the same
 *             mount
 *   ENOTDIR / EISDIR - directory/non-directory mismatch with an
 *             existing target
 *   EPERM   - either directory vnode (nl_dvp) is NULL
 * plus whatever nlookup(), ncp_writechk(), VOP_NREMOVE() or
 * VOP_NRENAME() return.
 */
int
kern_rename(struct nlookupdata *fromnd, struct nlookupdata *tond)
{
        struct nchandle fnchd;
        struct nchandle tnchd;
        struct namecache *ncp;
        struct vnode *fdvp;
        struct vnode *tdvp;
        struct mount *mp;
        int error;

        bwillinode(1);
        fromnd->nl_flags |= NLC_REFDVP | NLC_RENAME_SRC;
        if ((error = nlookup(fromnd)) != 0)
                return (error);
        if ((fnchd.ncp = fromnd->nl_nch.ncp->nc_parent) == NULL)
                return (ENOENT);
        fnchd.mount = fromnd->nl_nch.mount;
        /* hold the source parent so it can be rechecked after relocking */
        cache_hold(&fnchd);

        /*
         * unlock the source nch so we can lookup the target nch without
         * deadlocking.  The target may or may not exist so we do not check
         * for a target vp like kern_mkdir() and other creation functions do.
         *
         * The source and target directories are ref'd and rechecked after
         * everything is relocked to determine if the source or target file
         * has been renamed.
         */
        KKASSERT(fromnd->nl_flags & NLC_NCPISLOCKED);
        fromnd->nl_flags &= ~NLC_NCPISLOCKED;
        cache_unlock(&fromnd->nl_nch);

        tond->nl_flags |= NLC_RENAME_DST | NLC_REFDVP;
        if ((error = nlookup(tond)) != 0) {
                cache_drop(&fnchd);
                return (error);
        }
        if ((tnchd.ncp = tond->nl_nch.ncp->nc_parent) == NULL) {
                cache_drop(&fnchd);
                return (ENOENT);
        }
        tnchd.mount = tond->nl_nch.mount;
        /* hold the target parent as well; both are dropped on every exit */
        cache_hold(&tnchd);

        /*
         * If the source and target are the same there is nothing to do
         */
        if (fromnd->nl_nch.ncp == tond->nl_nch.ncp) {
                cache_drop(&fnchd);
                cache_drop(&tnchd);
                return (0);
        }

        /*
         * Mount points cannot be renamed or overwritten
         */
        if ((fromnd->nl_nch.ncp->nc_flag | tond->nl_nch.ncp->nc_flag) &
            NCF_ISMOUNTPT
        ) {
                cache_drop(&fnchd);
                cache_drop(&tnchd);
                return (EINVAL);
        }

        /*
         * relock the source ncp.  NOTE AFTER RELOCKING: the source ncp
         * may have become invalid while it was unlocked, nc_vp and nc_mount
         * could be NULL.
         *
         * Try a non-blocking lock first.  If that fails, the choice
         * between blocking directly and releasing the target lock first
         * is made by comparing the two ncp addresses.
         */
        if (cache_lock_nonblock(&fromnd->nl_nch) == 0) {
                cache_resolve(&fromnd->nl_nch, fromnd->nl_cred);
        } else if (fromnd->nl_nch.ncp > tond->nl_nch.ncp) {
                cache_lock(&fromnd->nl_nch);
                cache_resolve(&fromnd->nl_nch, fromnd->nl_cred);
        } else {
                cache_unlock(&tond->nl_nch);
                cache_lock(&fromnd->nl_nch);
                cache_resolve(&fromnd->nl_nch, fromnd->nl_cred);
                cache_lock(&tond->nl_nch);
                cache_resolve(&tond->nl_nch, tond->nl_cred);
        }
        fromnd->nl_flags |= NLC_NCPISLOCKED;

        /*
         * make sure the parent directories linkages are the same
         */
        if (fnchd.ncp != fromnd->nl_nch.ncp->nc_parent ||
            tnchd.ncp != tond->nl_nch.ncp->nc_parent) {
                cache_drop(&fnchd);
                cache_drop(&tnchd);
                return (ENOENT);
        }

        /*
         * Both the source and target must be within the same filesystem and
         * in the same filesystem as their parent directories within the
         * namecache topology.
         *
         * NOTE: fromnd's nc_mount or nc_vp could be NULL.
         */
        mp = fnchd.mount;
        if (mp != tnchd.mount || mp != fromnd->nl_nch.mount ||
            mp != tond->nl_nch.mount) {
                cache_drop(&fnchd);
                cache_drop(&tnchd);
                return (EXDEV);
        }

        /*
         * Make sure the mount point is writable
         */
        if ((error = ncp_writechk(&tond->nl_nch)) != 0) {
                cache_drop(&fnchd);
                cache_drop(&tnchd);
                return (error);
        }

        /*
         * If the target exists and either the source or target is a directory,
         * then both must be directories.
         *
         * Due to relocking of the source, fromnd->nl_nch.ncp->nc_vp might
         * have become NULL.
         *
         * (error is 0 on entry to this section: the ncp_writechk() call
         * above succeeded.)
         */
        if (tond->nl_nch.ncp->nc_vp) {
                if (fromnd->nl_nch.ncp->nc_vp == NULL) {
                        error = ENOENT;
                } else if (fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
                        if (tond->nl_nch.ncp->nc_vp->v_type != VDIR)
                                error = ENOTDIR;
                } else if (tond->nl_nch.ncp->nc_vp->v_type == VDIR) {
                        error = EISDIR;
                }
        }

        /*
         * You cannot rename a source into itself or a subdirectory of itself.
         * We check this by traversing the target directory upwards looking
         * for a match against the source.
         */
        if (error == 0) {
                for (ncp = tnchd.ncp; ncp; ncp = ncp->nc_parent) {
                        if (fromnd->nl_nch.ncp == ncp) {
                                error = EINVAL;
                                break;
                        }
                }
        }

        /* parent holds are no longer needed past this point */
        cache_drop(&fnchd);
        cache_drop(&tnchd);

        /*
         * Even though the namespaces are different, they may still represent
         * hardlinks to the same file.  The filesystem might have a hard time
         * with this so we issue a NREMOVE of the source instead of a NRENAME
         * when we detect the situation.
         *
         * NOTE(review): the nc_vp equality test below also matches when
         * both nc_vp pointers are NULL (source invalidated during relock
         * and target nonexistent) -- confirm that issuing NREMOVE is the
         * intended outcome in that case.
         */
        if (error == 0) {
                fdvp = fromnd->nl_dvp;
                tdvp = tond->nl_dvp;
                if (fdvp == NULL || tdvp == NULL) {
                        error = EPERM;
                } else if (fromnd->nl_nch.ncp->nc_vp == tond->nl_nch.ncp->nc_vp) {
                        error = VOP_NREMOVE(&fromnd->nl_nch, fdvp,
                                            fromnd->nl_cred);
                } else {
                        error = VOP_NRENAME(&fromnd->nl_nch, &tond->nl_nch,
                                            fdvp, tdvp, tond->nl_cred);
                }
        }
        return (error);
}
3710
3711 /*
3712  * rename_args(char *from, char *to)
3713  *
3714  * Rename files.  Source and destination must either both be directories,
3715  * or both not be directories.  If target is a directory, it must be empty.
3716  *
3717  * MPALMOSTSAFE
3718  */
3719 int
3720 sys_rename(struct rename_args *uap)
3721 {
3722         struct nlookupdata fromnd, tond;
3723         int error;
3724
3725         get_mplock();
3726         error = nlookup_init(&fromnd, uap->from, UIO_USERSPACE, 0);
3727         if (error == 0) {
3728                 error = nlookup_init(&tond, uap->to, UIO_USERSPACE, 0);
3729                 if (error == 0)
3730                         error = kern_rename(&fromnd, &tond);
3731                 nlookup_done(&tond);
3732         }
3733         nlookup_done(&fromnd);
3734         rel_mplock();
3735         return (error);
3736 }
3737
3738 int
3739 kern_mkdir(struct nlookupdata *nd, int mode)
3740 {
3741         struct thread *td = curthread;
3742         struct proc *p = td->td_proc;
3743         struct vnode *vp;
3744         struct vattr vattr;
3745         int error;
3746
3747         bwillinode(1);
3748         nd->nl_flags |= NLC_WILLBEDIR | NLC_CREATE | NLC_REFDVP;
3749         if ((error = nlookup(nd)) != 0)
3750                 return (error);
3751
3752         if (nd->nl_nch.ncp->nc_vp)
3753                 return (EEXIST);
3754         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3755                 return (error);
3756         VATTR_NULL(&vattr);
3757         vattr.va_type = VDIR;
3758         vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;
3759
3760         vp = NULL;
3761         error = VOP_NMKDIR(&nd->nl_nch, nd->nl_dvp, &vp, td->td_ucred, &vattr);
3762         if (error == 0)
3763                 vput(vp);
3764         return (error);
3765 }
3766
3767 /*
3768  * mkdir_args(char *path, int mode)
3769  *
3770  * Make a directory file.
3771  *
3772  * MPALMOSTSAFE
3773  */
3774 int
3775 sys_mkdir(struct mkdir_args *uap)
3776 {
3777         struct nlookupdata nd;
3778         int error;
3779
3780         get_mplock();
3781         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3782         if (error == 0)
3783                 error = kern_mkdir(&nd, uap->mode);
3784         nlookup_done(&nd);
3785         rel_mplock();
3786         return (error);
3787 }
3788
3789 int
3790 kern_rmdir(struct nlookupdata *nd)
3791 {
3792         int error;
3793
3794         bwillinode(1);
3795         nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
3796         if ((error = nlookup(nd)) != 0)
3797                 return (error);
3798
3799         /*
3800          * Do not allow directories representing mount points to be
3801          * deleted, even if empty.  Check write perms on mount point
3802          * in case the vnode is aliased (aka nullfs).
3803          */
3804         if (nd->nl_nch.ncp->nc_flag & (NCF_ISMOUNTPT))
3805                 return (EINVAL);
3806         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3807                 return (error);
3808         error = VOP_NRMDIR(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
3809         return (error);
3810 }
3811
3812 /*
3813  * rmdir_args(char *path)
3814  *
3815  * Remove a directory file.
3816  *
3817  * MPALMOSTSAFE
3818  */
3819 int
3820 sys_rmdir(struct rmdir_args *uap)
3821 {
3822         struct nlookupdata nd;
3823         int error;
3824
3825         get_mplock();
3826         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3827         if (error == 0)
3828                 error = kern_rmdir(&nd);
3829         nlookup_done(&nd);
3830         rel_mplock();
3831         return (error);
3832 }
3833
3834 int
3835 kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res,
3836     enum uio_seg direction)
3837 {
3838         struct thread *td = curthread;
3839         struct proc *p = td->td_proc;
3840         struct vnode *vp;
3841         struct file *fp;
3842         struct uio auio;
3843         struct iovec aiov;
3844         off_t loff;
3845         int error, eofflag;
3846
3847         if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
3848                 return (error);
3849         if ((fp->f_flag & FREAD) == 0) {
3850                 error = EBADF;
3851                 goto done;
3852         }
3853         vp = (struct vnode *)fp->f_data;
3854 unionread:
3855         if (vp->v_type != VDIR) {
3856                 error = EINVAL;
3857                 goto done;
3858         }
3859         aiov.iov_base = buf;
3860         aiov.iov_len = count;
3861         auio.uio_iov = &aiov;
3862         auio.uio_iovcnt = 1;
3863         auio.uio_rw = UIO_READ;
3864         auio.uio_segflg = direction;
3865         auio.uio_td = td;
3866         auio.uio_resid = count;
3867         loff = auio.uio_offset = fp->f_offset;
3868         error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
3869         fp->f_offset = auio.uio_offset;
3870         if (error)
3871                 goto done;
3872         if (count == auio.uio_resid) {
3873                 if (union_dircheckp) {
3874                         error = union_dircheckp(td, &vp, fp);
3875                         if (error == -1)
3876                                 goto unionread;
3877                         if (error)
3878                                 goto done;
3879                 }
3880 #if 0
3881                 if ((vp->v_flag & VROOT) &&
3882                     (vp->v_mount->mnt_flag & MNT_UNION)) {
3883                         struct vnode *tvp = vp;
3884                         vp = vp->v_mount->mnt_vnodecovered;
3885                         vref(vp);
3886                         fp->f_data = vp;
3887                         fp->f_offset = 0;
3888                         vrele(tvp);
3889                         goto unionread;
3890                 }
3891 #endif
3892         }
3893
3894         /*
3895          * WARNING!  *basep may not be wide enough to accomodate the
3896          * seek offset.   XXX should we hack this to return the upper 32 bits
3897          * for offsets greater then 4G?
3898          */
3899         if (basep) {
3900                 *basep = (long)loff;
3901         }
3902         *res = count - auio.uio_resid;
3903 done:
3904         fdrop(fp);
3905         return (error);
3906 }
3907
3908 /*
3909  * getdirentries_args(int fd, char *buf, u_int conut, long *basep)
3910  *
3911  * Read a block of directory entries in a file system independent format.
3912  *
3913  * MPALMOSTSAFE
3914  */
3915 int
3916 sys_getdirentries(struct getdirentries_args *uap)
3917 {
3918         long base;
3919         int error;
3920
3921         get_mplock();
3922         error = kern_getdirentries(uap->fd, uap->buf, uap->count, &base,
3923                                    &uap->sysmsg_result, UIO_USERSPACE);
3924         rel_mplock();
3925
3926         if (error == 0 && uap->basep)
3927                 error = copyout(&base, uap->basep, sizeof(*uap->basep));
3928         return (error);
3929 }
3930
3931 /*
3932  * getdents_args(int fd, char *buf, size_t count)
3933  *
3934  * MPALMOSTSAFE
3935  */
3936 int
3937 sys_getdents(struct getdents_args *uap)
3938 {
3939         int error;
3940
3941         get_mplock();
3942         error = kern_getdirentries(uap->fd, uap->buf, uap->count, NULL,
3943                                    &uap->sysmsg_result, UIO_USERSPACE);
3944         rel_mplock();
3945
3946         return (error);
3947 }
3948
3949 /*
3950  * Set the mode mask for creation of filesystem nodes.
3951  *
3952  * umask(int newmask)
3953  *
3954  * MPSAFE
3955  */
3956 int
3957 sys_umask(struct umask_args *uap)
3958 {
3959         struct thread *td = curthread;
3960         struct proc *p = td->td_proc;
3961         struct filedesc *fdp;
3962
3963         fdp = p->p_fd;
3964         uap->sysmsg_result = fdp->fd_cmask;
3965         fdp->fd_cmask = uap->newmask & ALLPERMS;
3966         return (0);
3967 }
3968
3969 /*
3970  * revoke(char *path)
3971  *
3972  * Void all references to file by ripping underlying filesystem
3973  * away from vnode.
3974  *
3975  * MPALMOSTSAFE
3976  */
3977 int
3978 sys_revoke(struct revoke_args *uap)
3979 {
3980         struct nlookupdata nd;
3981         struct vattr vattr;
3982         struct vnode *vp;
3983         struct ucred *cred;
3984         int error;
3985
3986         vp = NULL;
3987         get_mplock();
3988         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3989         if (error == 0)
3990                 error = nlookup(&nd);
3991         if (error == 0)
3992                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3993         cred = crhold(nd.nl_cred);
3994         nlookup_done(&nd);
3995         if (error == 0) {
3996                 if (error == 0)
3997                         error = VOP_GETATTR(vp, &vattr);
3998                 if (error == 0 && cred->cr_uid != vattr.va_uid)
3999                         error = priv_check_cred(cred, PRIV_VFS_REVOKE, 0);
4000                 if (error == 0 && (vp->v_type == VCHR || vp->v_type == VBLK)) {
4001                         if (vcount(vp) > 0)
4002                                 error = vrevoke(vp, cred);
4003                 } else if (error == 0) {
4004                         error = vrevoke(vp, cred);
4005                 }
4006                 vrele(vp);
4007         }
4008         if (cred)
4009                 crfree(cred);
4010         rel_mplock();
4011         return (error);
4012 }
4013
4014 /*
4015  * getfh_args(char *fname, fhandle_t *fhp)
4016  *
4017  * Get (NFS) file handle
4018  *
4019  * NOTE: We use the fsid of the covering mount, even if it is a nullfs
4020  * mount.  This allows nullfs mounts to be explicitly exported.
4021  *
4022  * WARNING: nullfs mounts of HAMMER PFS ROOTs are safe.
4023  *
4024  *          nullfs mounts of subdirectories are not safe.  That is, it will
4025  *          work, but you do not really have protection against access to
4026  *          the related parent directories.
4027  *
4028  * MPALMOSTSAFE
4029  */
4030 int
4031 sys_getfh(struct getfh_args *uap)
4032 {
4033         struct thread *td = curthread;
4034         struct nlookupdata nd;
4035         fhandle_t fh;
4036         struct vnode *vp;
4037         struct mount *mp;
4038         int error;
4039
4040         /*
4041          * Must be super user
4042          */
4043         if ((error = priv_check(td, PRIV_ROOT)) != 0)
4044                 return (error);
4045
4046         vp = NULL;
4047         get_mplock();
4048         error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
4049         if (error == 0)
4050                 error = nlookup(&nd);
4051         if (error == 0)
4052                 error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4053         mp = nd.nl_nch.mount;
4054         nlookup_done(&nd);
4055         if (error == 0) {
4056                 bzero(&fh, sizeof(fh));
4057                 fh.fh_fsid = mp->mnt_stat.f_fsid;
4058                 error = VFS_VPTOFH(vp, &fh.fh_fid);
4059                 vput(vp);
4060                 if (error == 0)
4061                         error = copyout(&fh, uap->fhp, sizeof(fh));
4062         }
4063         rel_mplock();
4064         return (error);
4065 }
4066
4067 /*
4068  * fhopen_args(const struct fhandle *u_fhp, int flags)
4069  *
4070  * syscall for the rpc.lockd to use to translate a NFS file handle into
4071  * an open descriptor.
4072  *
4073  * warning: do not remove the priv_check() call or this becomes one giant
4074  * security hole.
4075  *
4076  * MPALMOSTSAFE
4077  */
4078 int
4079 sys_fhopen(struct fhopen_args *uap)
4080 {
4081         struct thread *td = curthread;
4082         struct filedesc *fdp = td->td_proc->p_fd;
4083         struct mount *mp;
4084         struct vnode *vp;
4085         struct fhandle fhp;
4086         struct vattr vat;
4087         struct vattr *vap = &vat;
4088         struct flock lf;
4089         int fmode, mode, error, type;
4090         struct file *nfp;
4091         struct file *fp;
4092         int indx;
4093
4094         /*
4095          * Must be super user
4096          */
4097         error = priv_check(td, PRIV_ROOT);
4098         if (error)
4099                 return (error);
4100
4101         fmode = FFLAGS(uap->flags);
4102
4103         /*
4104          * Why not allow a non-read/write open for our lockd?
4105          */
4106         if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4107                 return (EINVAL);
4108         error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4109         if (error)
4110                 return(error);
4111
4112         /*
4113          * Find the mount point
4114          */
4115         get_mplock();
4116         mp = vfs_getvfs(&fhp.fh_fsid);
4117         if (mp == NULL) {
4118                 error = ESTALE;
4119                 goto  done;
4120         }
4121         /* now give me my vnode, it gets returned to me locked */
4122         error = VFS_FHTOVP(mp, NULL, &fhp.fh_fid, &vp);
4123         if (error)
4124                 goto done;
4125         /*
4126          * from now on we have to make sure not
4127          * to forget about the vnode
4128          * any error that causes an abort must vput(vp)
4129          * just set error = err and 'goto bad;'.
4130          */
4131
4132         /*
4133          * from vn_open
4134          */
4135         if (vp->v_type == VLNK) {
4136                 error = EMLINK;
4137                 goto bad;
4138         }
4139         if (vp->v_type == VSOCK) {
4140                 error = EOPNOTSUPP;
4141                 goto bad;
4142         }
4143         mode = 0;
4144         if (fmode & (FWRITE | O_TRUNC)) {
4145                 if (vp->v_type == VDIR) {
4146                         error = EISDIR;
4147                         goto bad;
4148                 }
4149                 error = vn_writechk(vp, NULL);
4150                 if (error)
4151                         goto bad;
4152                 mode |= VWRITE;
4153         }
4154         if (fmode & FREAD)
4155                 mode |= VREAD;
4156         if (mode) {
4157                 error = VOP_ACCESS(vp, mode, td->td_ucred);
4158                 if (error)
4159                         goto bad;
4160         }
4161         if (fmode & O_TRUNC) {
4162                 vn_unlock(vp);                          /* XXX */
4163                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
4164                 VATTR_NULL(vap);
4165                 vap->va_size = 0;
4166                 error = VOP_SETATTR(vp, vap, td->td_ucred);
4167                 if (error)
4168                         goto bad;
4169         }
4170
4171         /*
4172          * VOP_OPEN needs the file pointer so it can potentially override
4173          * it.
4174          *
4175          * WARNING! no f_nchandle will be associated when fhopen()ing a
4176          * directory.  XXX
4177          */
4178         if ((error = falloc(td->td_lwp, &nfp, &indx)) != 0)
4179                 goto bad;
4180         fp = nfp;
4181
4182         error = VOP_OPEN(vp, fmode, td->td_ucred, fp);
4183         if (error) {
4184                 /*
4185                  * setting f_ops this way prevents VOP_CLOSE from being
4186                  * called or fdrop() releasing the vp from v_data.   Since
4187                  * the VOP_OPEN failed we don't want to VOP_CLOSE.
4188                  */
4189                 fp->f_ops = &badfileops;
4190                 fp->f_data = NULL;
4191                 goto bad_drop;
4192         }
4193
4194         /*
4195          * The fp is given its own reference, we still have our ref and lock.
4196          *
4197          * Assert that all regular files must be created with a VM object.
4198          */
4199         if (vp->v_type == VREG && vp->v_object == NULL) {
4200                 kprintf("fhopen: regular file did not have VM object: %p\n", vp);
4201                 goto bad_drop;
4202         }
4203
4204         /*
4205          * The open was successful.  Handle any locking requirements.
4206          */
4207         if (fmode & (O_EXLOCK | O_SHLOCK)) {
4208                 lf.l_whence = SEEK_SET;
4209                 lf.l_start = 0;
4210                 lf.l_len = 0;
4211                 if (fmode & O_EXLOCK)
4212                         lf.l_type = F_WRLCK;
4213                 else
4214                         lf.l_type = F_RDLCK;
4215                 if (fmode & FNONBLOCK)
4216                         type = 0;
4217                 else
4218                         type = F_WAIT;
4219                 vn_unlock(vp);
4220                 if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
4221                         /*
4222                          * release our private reference.
4223                          */
4224                         fsetfd(fdp, NULL, indx);
4225                         fdrop(fp);
4226                         vrele(vp);
4227                         goto done;
4228                 }
4229                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4230                 fp->f_flag |= FHASLOCK;
4231         }
4232
4233         /*
4234          * Clean up.  Associate the file pointer with the previously
4235          * reserved descriptor and return it.
4236          */
4237         vput(vp);
4238         rel_mplock();
4239         fsetfd(fdp, fp, indx);
4240         fdrop(fp);
4241         uap->sysmsg_result = indx;
4242         return (0);
4243
4244 bad_drop:
4245         fsetfd(fdp, NULL, indx);
4246         fdrop(fp);
4247 bad:
4248         vput(vp);
4249 done:
4250         rel_mplock();
4251         return (error);
4252 }
4253
4254 /*
4255  * fhstat_args(struct fhandle *u_fhp, struct stat *sb)
4256  *
4257  * MPALMOSTSAFE
4258  */
4259 int
4260 sys_fhstat(struct fhstat_args *uap)
4261 {
4262         struct thread *td = curthread;
4263         struct stat sb;
4264         fhandle_t fh;
4265         struct mount *mp;
4266         struct vnode *vp;
4267         int error;
4268
4269         /*
4270          * Must be super user
4271          */
4272         error = priv_check(td, PRIV_ROOT);
4273         if (error)
4274                 return (error);
4275
4276         error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4277         if (error)
4278                 return (error);
4279
4280         get_mplock();
4281         if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
4282                 error = ESTALE;
4283         if (error == 0) {
4284                 if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) == 0) {
4285                         error = vn_stat(vp, &sb, td->td_ucred);
4286                         vput(vp);
4287                 }
4288         }
4289         rel_mplock();
4290         if (error == 0)
4291                 error = copyout(&sb, uap->sb, sizeof(sb));
4292         return (error);
4293 }
4294
4295 /*
4296  * fhstatfs_args(struct fhandle *u_fhp, struct statfs *buf)
4297  *
4298  * MPALMOSTSAFE
4299  */
4300 int
4301 sys_fhstatfs(struct fhstatfs_args *uap)
4302 {
4303         struct thread *td = curthread;
4304         struct proc *p = td->td_proc;
4305         struct statfs *sp;
4306         struct mount *mp;
4307         struct vnode *vp;
4308         struct statfs sb;
4309         char *fullpath, *freepath;
4310         fhandle_t fh;
4311         int error;
4312
4313         /*
4314          * Must be super user
4315          */
4316         if ((error = priv_check(td, PRIV_ROOT)))
4317                 return (error);
4318
4319         if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
4320                 return (error);
4321
4322         get_mplock();
4323
4324         if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
4325                 error = ESTALE;
4326                 goto done;
4327         }
4328         if (p != NULL && !chroot_visible_mnt(mp, p)) {
4329                 error = ESTALE;
4330                 goto done;
4331         }
4332
4333         if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) != 0)
4334                 goto done;
4335         mp = vp->v_mount;
4336         sp = &mp->mnt_stat;
4337         vput(vp);
4338         if ((error = VFS_STATFS(mp, sp, td->td_ucred)) != 0)
4339                 goto done;
4340
4341         error = mount_path(p, mp, &fullpath, &freepath);
4342         if (error)
4343                 goto done;
4344         bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
4345         strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
4346         kfree(freepath, M_TEMP);
4347
4348         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4349         if (priv_check(td, PRIV_ROOT)) {
4350                 bcopy(sp, &sb, sizeof(sb));
4351                 sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
4352                 sp = &sb;
4353         }
4354         error = copyout(sp, uap->buf, sizeof(*sp));
4355 done:
4356         rel_mplock();
4357         return (error);
4358 }
4359
4360 /*
4361  * fhstatvfs_args(struct fhandle *u_fhp, struct statvfs *buf)
4362  *
4363  * MPALMOSTSAFE
4364  */
4365 int
4366 sys_fhstatvfs(struct fhstatvfs_args *uap)
4367 {
4368         struct thread *td = curthread;
4369         struct proc *p = td->td_proc;
4370         struct statvfs *sp;
4371         struct mount *mp;
4372         struct vnode *vp;
4373         fhandle_t fh;
4374         int error;
4375
4376         /*
4377          * Must be super user
4378          */
4379         if ((error = priv_check(td, PRIV_ROOT)))
4380                 return (error);
4381
4382         if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
4383                 return (error);
4384
4385         get_mplock();
4386
4387         if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
4388                 error = ESTALE;
4389                 goto done;
4390         }
4391         if (p != NULL && !chroot_visible_mnt(mp, p)) {
4392                 error = ESTALE;
4393                 goto done;
4394         }
4395
4396         if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)))
4397                 goto done;
4398         mp = vp->v_mount;
4399         sp = &mp->mnt_vstat;
4400         vput(vp);
4401         if ((error = VFS_STATVFS(mp, sp, td->td_ucred)) != 0)
4402                 goto done;
4403
4404         sp->f_flag = 0;
4405         if (mp->mnt_flag & MNT_RDONLY)
4406                 sp->f_flag |= ST_RDONLY;
4407         if (mp->mnt_flag & MNT_NOSUID)
4408                 sp->f_flag |= ST_NOSUID;
4409         error = copyout(sp, uap->buf, sizeof(*sp));
4410 done:
4411         rel_mplock();
4412         return (error);
4413 }
4414
4415
4416 /*
4417  * Syscall to push extended attribute configuration information into the
4418  * VFS.  Accepts a path, which it converts to a mountpoint, as well as
4419  * a command (int cmd), and attribute name and misc data.  For now, the
4420  * attribute name is left in userspace for consumption by the VFS_op.
4421  * It will probably be changed to be copied into sysspace by the
4422  * syscall in the future, once issues with various consumers of the
4423  * attribute code have raised their hands.
4424  *
4425  * Currently this is used only by UFS Extended Attributes.
4426  *
4427  * MPALMOSTSAFE
4428  */
4429 int
4430 sys_extattrctl(struct extattrctl_args *uap)
4431 {
4432         struct nlookupdata nd;
4433         struct mount *mp;
4434         struct vnode *vp;
4435         int error;
4436
4437         vp = NULL;
4438         get_mplock();
4439         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4440         if (error == 0)
4441                 error = nlookup(&nd);
4442         if (error == 0) {
4443                 mp = nd.nl_nch.mount;
4444                 error = VFS_EXTATTRCTL(mp, uap->cmd,
4445                                 uap->attrname, uap->arg,
4446                                 nd.nl_cred);
4447         }
4448         nlookup_done(&nd);
4449         rel_mplock();
4450
4451         return (error);
4452 }
4453
4454 /*
4455  * Syscall to set a named extended attribute on a file or directory.
4456  * Accepts attribute name, and a uio structure pointing to the data to set.
4457  * The uio is consumed in the style of writev().  The real work happens
4458  * in VOP_SETEXTATTR().
4459  *
4460  * MPALMOSTSAFE
4461  */
4462 int
4463 sys_extattr_set_file(struct extattr_set_file_args *uap)
4464 {
4465         char attrname[EXTATTR_MAXNAMELEN];
4466         struct iovec aiov[UIO_SMALLIOV];
4467         struct iovec *needfree;
4468         struct nlookupdata nd;
4469         struct iovec *iov;
4470         struct vnode *vp;
4471         struct uio auio;
4472         u_int iovlen;
4473         u_int cnt;
4474         int error;
4475         int i;
4476
4477         error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
4478         if (error)
4479                 return (error);
4480
4481         vp = NULL;
4482         get_mplock();
4483         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4484         if (error == 0)
4485                 error = nlookup(&nd);
4486         if (error == 0)
4487                 error = ncp_writechk(&nd.nl_nch);
4488         if (error == 0)
4489                 error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4490         if (error) {
4491                 nlookup_done(&nd);
4492                 rel_mplock();
4493                 return (error);
4494         }
4495
4496         needfree = NULL;
4497         iovlen = uap->iovcnt * sizeof(struct iovec);
4498         if (uap->iovcnt > UIO_SMALLIOV) {
4499                 if (uap->iovcnt > UIO_MAXIOV) {
4500                         error = EINVAL;
4501                         goto done;
4502                 }
4503                 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
4504                 needfree = iov;
4505         } else {
4506                 iov = aiov;
4507         }
4508         auio.uio_iov = iov;
4509         auio.uio_iovcnt = uap->iovcnt;
4510         auio.uio_rw = UIO_WRITE;
4511         auio.uio_segflg = UIO_USERSPACE;
4512         auio.uio_td = nd.nl_td;
4513         auio.uio_offset = 0;
4514         if ((error = copyin(uap->iovp, iov, iovlen)))
4515                 goto done;
4516         auio.uio_resid = 0;
4517         for (i = 0; i < uap->iovcnt; i++) {
4518                 if (iov->iov_len > LONG_MAX - auio.uio_resid) {
4519                         error = EINVAL;
4520                         goto done;
4521                 }
4522                 auio.uio_resid += iov->iov_len;
4523                 iov++;
4524         }
4525         cnt = auio.uio_resid;
4526         error = VOP_SETEXTATTR(vp, attrname, &auio, nd.nl_cred);
4527         cnt -= auio.uio_resid;
4528         uap->sysmsg_result = cnt;
4529 done:
4530         vput(vp);
4531         nlookup_done(&nd);
4532         rel_mplock();
4533         if (needfree)
4534                 FREE(needfree, M_IOV);
4535         return (error);
4536 }
4537
4538 /*
4539  * Syscall to get a named extended attribute on a file or directory.
4540  * Accepts attribute name, and a uio structure pointing to a buffer for the
4541  * data.  The uio is consumed in the style of readv().  The real work
4542  * happens in VOP_GETEXTATTR();
4543  *
4544  * MPALMOSTSAFE
4545  */
4546 int
4547 sys_extattr_get_file(struct extattr_get_file_args *uap)
4548 {
4549         char attrname[EXTATTR_MAXNAMELEN];
4550         struct iovec aiov[UIO_SMALLIOV];
4551         struct iovec *needfree;
4552         struct nlookupdata nd;
4553         struct iovec *iov;
4554         struct vnode *vp;
4555         struct uio auio;
4556         u_int iovlen;
4557         u_int cnt;
4558         int error;
4559         int i;
4560
4561         error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
4562         if (error)
4563                 return (error);
4564
4565         vp = NULL;
4566         get_mplock();
4567         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4568         if (error == 0)
4569                 error = nlookup(&nd);
4570         if (error == 0)
4571                 error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4572         if (error) {
4573                 nlookup_done(&nd);
4574                 rel_mplock();
4575                 return (error);
4576         }
4577
4578         iovlen = uap->iovcnt * sizeof (struct iovec);
4579         needfree = NULL;
4580         if (uap->iovcnt > UIO_SMALLIOV) {
4581                 if (uap->iovcnt > UIO_MAXIOV) {
4582                         error = EINVAL;
4583                         goto done;
4584                 }
4585                 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
4586                 needfree = iov;
4587         } else {
4588                 iov = aiov;
4589         }
4590         auio.uio_iov = iov;
4591         auio.uio_iovcnt = uap->iovcnt;
4592         auio.uio_rw = UIO_READ;
4593         auio.uio_segflg = UIO_USERSPACE;
4594         auio.uio_td = nd.nl_td;
4595         auio.uio_offset = 0;
4596         if ((error = copyin(uap->iovp, iov, iovlen)))
4597                 goto done;
4598         auio.uio_resid = 0;
4599         for (i = 0; i < uap->iovcnt; i++) {
4600                 if (iov->iov_len > LONG_MAX - auio.uio_resid) {
4601                         error = EINVAL;
4602                         goto done;
4603                 }
4604                 auio.uio_resid += iov->iov_len;
4605                 iov++;
4606         }
4607         cnt = auio.uio_resid;
4608         error = VOP_GETEXTATTR(vp, attrname, &auio, nd.nl_cred);
4609         cnt -= auio.uio_resid;
4610         uap->sysmsg_result = cnt;
4611 done:
4612         vput(vp);
4613         nlookup_done(&nd);
4614         rel_mplock();
4615         if (needfree)
4616                 FREE(needfree, M_IOV);
4617         return(error);
4618 }
4619
4620 /*
4621  * Syscall to delete a named extended attribute from a file or directory.
4622  * Accepts attribute name.  The real work happens in VOP_SETEXTATTR().
4623  *
4624  * MPALMOSTSAFE
4625  */
4626 int
4627 sys_extattr_delete_file(struct extattr_delete_file_args *uap)
4628 {
4629         char attrname[EXTATTR_MAXNAMELEN];
4630         struct nlookupdata nd;
4631         struct vnode *vp;
4632         int error;
4633
4634         error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
4635         if (error)
4636                 return(error);
4637
4638         get_mplock();
4639         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4640         if (error == 0)
4641                 error = nlookup(&nd);
4642         if (error == 0)
4643                 error = ncp_writechk(&nd.nl_nch);
4644         if (error == 0) {
4645                 error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4646                 if (error == 0) {
4647                         error = VOP_SETEXTATTR(vp, attrname, NULL, nd.nl_cred);
4648                         vput(vp);
4649                 }
4650         }
4651         nlookup_done(&nd);
4652         rel_mplock();
4653         return(error);
4654 }
4655
4656 /*
4657  * Determine if the mount is visible to the process.
4658  */
4659 static int
4660 chroot_visible_mnt(struct mount *mp, struct proc *p)
4661 {
4662         struct nchandle nch;
4663
4664         /*
4665          * Traverse from the mount point upwards.  If we hit the process
4666          * root then the mount point is visible to the process.
4667          */
4668         nch = mp->mnt_ncmountpt;
4669         while (nch.ncp) {
4670                 if (nch.mount == p->p_fd->fd_nrdir.mount &&
4671                     nch.ncp == p->p_fd->fd_nrdir.ncp) {
4672                         return(1);
4673                 }
4674                 if (nch.ncp == nch.mount->mnt_ncmountpt.ncp) {
4675                         nch = nch.mount->mnt_ncmounton;
4676                 } else {
4677                         nch.ncp = nch.ncp->nc_parent;
4678                 }
4679         }
4680
4681         /*
4682          * If the mount point is not visible to the process, but the
4683          * process root is in a subdirectory of the mount, return
4684          * TRUE anyway.
4685          */
4686         if (p->p_fd->fd_nrdir.mount == mp)
4687                 return(1);
4688
4689         return(0);
4690 }
4691