From 5b287bbaa260a30f5f0e6a075ea2c4e2ac2ec80a Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 24 May 2006 03:23:35 +0000 Subject: [PATCH] spinlock more of the file descriptor code. No appreciable difference in performance on buildworld tests. Change getvnode() to holdvnode() and use semantics similar to holdsock(). The old getvnode() code wasn't fhold()ing the file pointer. The new holdvnode() code does. --- sys/emulation/linux/linux_file.c | 31 +- sys/emulation/linux/linux_stats.c | 14 +- sys/kern/kern_acl.c | 31 +- sys/kern/kern_descrip.c | 623 +++++++++++++++++++++--------- sys/kern/kern_slaballoc.c | 20 +- sys/kern/subr_kcore.c | 5 +- sys/kern/syscalls.master | 4 +- sys/kern/vfs_cache.c | 12 +- sys/kern/vfs_lock.c | 18 +- sys/kern/vfs_syscalls.c | 107 +++-- sys/sys/filedesc.h | 5 +- sys/vfs/fdesc/fdesc_vnops.c | 9 +- 12 files changed, 599 insertions(+), 280 deletions(-) diff --git a/sys/emulation/linux/linux_file.c b/sys/emulation/linux/linux_file.c index 912cf287b3..da2ac88275 100644 --- a/sys/emulation/linux/linux_file.c +++ b/sys/emulation/linux/linux_file.c @@ -26,7 +26,7 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD: src/sys/compat/linux/linux_file.c,v 1.41.2.6 2003/01/06 09:19:43 fjoe Exp $ - * $DragonFly: src/sys/emulation/linux/linux_file.c,v 1.29 2006/05/19 07:33:43 dillon Exp $ + * $DragonFly: src/sys/emulation/linux/linux_file.c,v 1.30 2006/05/24 03:23:30 dillon Exp $ */ #include "opt_compat.h" @@ -261,28 +261,35 @@ getdents_common(struct linux_getdents64_args *args, int is64bit) KKASSERT(p); - if ((error = getvnode(p->p_fd, args->fd, &fp)) != 0) + if ((error = holdvnode(p->p_fd, args->fd, &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) - return (EBADF); + if ((fp->f_flag & FREAD) == 0) { + error = EBADF; + goto done; + } vp = (struct vnode *) fp->f_data; - if (vp->v_type != VDIR) - return (EINVAL); + if (vp->v_type != VDIR) { + error = EINVAL; + goto done; + } - if ((error = VOP_GETATTR(vp, &va))) - return (error); + if ((error = VOP_GETATTR(vp, &va)) != 0) + goto done; nbytes = args->count; if (nbytes == 1) { /* readdir(2) case. Always struct dirent. */ - if (is64bit) - return (EINVAL); + if (is64bit) { + error = EINVAL; + goto done; + } nbytes = sizeof(linux_dirent); justone = 1; - } else + } else { justone = 0; + } off = fp->f_offset; @@ -430,6 +437,8 @@ out: VOP_UNLOCK(vp, 0); free(buf, M_TEMP); +done: + fdrop(fp); return (error); } diff --git a/sys/emulation/linux/linux_stats.c b/sys/emulation/linux/linux_stats.c index 57537bdf15..b089c0599d 100644 --- a/sys/emulation/linux/linux_stats.c +++ b/sys/emulation/linux/linux_stats.c @@ -26,7 +26,7 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
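The getdents_common() conversion above is the template for most of this patch: take a counted reference on the file pointer up front, funnel every early return through a single exit label, and drop the reference there. A minimal userland sketch of that discipline, with a hypothetical refcounted object standing in for struct file (hold_obj/drop_obj/use_obj are illustrative names, not kernel APIs):

    #include <stdio.h>
    #include <stdlib.h>

    struct obj {
        int refs;                   /* reference count */
        int readable;               /* stand-in for FREAD */
    };

    static struct obj *
    hold_obj(struct obj *o)
    {
        ++o->refs;                  /* take a reference before any use */
        return (o);
    }

    static void
    drop_obj(struct obj *o)
    {
        if (--o->refs == 0)
            free(o);
    }

    static int
    use_obj(struct obj *o)
    {
        int error;

        hold_obj(o);
        if (!o->readable) {
            error = 1;              /* EBADF analogue */
            goto done;
        }
        error = 0;                  /* real work would go here */
    done:
        drop_obj(o);                /* released on every exit path */
        return (error);
    }

    int
    main(void)
    {
        struct obj *o = malloc(sizeof(*o));

        o->refs = 1;
        o->readable = 1;
        printf("use_obj -> %d\n", use_obj(o));
        drop_obj(o);
        return (0);
    }

The goto-done shape is what keeps later error checks from leaking the reference.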
* * $FreeBSD: src/sys/compat/linux/linux_stats.c,v 1.22.2.3 2001/11/05 19:08:23 marcel Exp $ - * $DragonFly: src/sys/emulation/linux/linux_stats.c,v 1.18 2006/05/06 18:48:50 dillon Exp $ + * $DragonFly: src/sys/emulation/linux/linux_stats.c,v 1.19 2006/05/24 03:23:30 dillon Exp $ */ #include @@ -274,12 +274,12 @@ linux_fstatfs(struct linux_fstatfs_args *args) if (ldebug(fstatfs)) printf(ARGS(fstatfs, "%d, *"), args->fd); #endif - error = kern_fstatfs(args->fd, &statfs); - - if (error == 0) - error = getvnode(p->p_fd, args->fd, &fp); - if (error == 0) - error = vn_get_namelen((struct vnode *)fp->f_data, &namelen); + if ((error = kern_fstatfs(args->fd, &statfs)) != 0) + return (error); + if ((error = holdvnode(p->p_fd, args->fd, &fp)) != 0) + return (error); + error = vn_get_namelen((struct vnode *)fp->f_data, &namelen); + fdrop(fp); if (error == 0) error = statfs_copyout(&statfs, args->buf, (l_int)namelen); return (error); diff --git a/sys/kern/kern_acl.c b/sys/kern/kern_acl.c index 488f4895cd..6fb5c045c0 100644 --- a/sys/kern/kern_acl.c +++ b/sys/kern/kern_acl.c @@ -24,7 +24,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/kern/kern_acl.c,v 1.2.2.1 2000/07/28 18:48:16 rwatson Exp $ - * $DragonFly: src/sys/kern/kern_acl.c,v 1.12 2006/05/06 02:43:12 dillon Exp $ + * $DragonFly: src/sys/kern/kern_acl.c,v 1.13 2006/05/24 03:23:31 dillon Exp $ */ /* @@ -207,11 +207,11 @@ __acl_get_fd(struct __acl_get_fd_args *uap) int error; KKASSERT(td->td_proc); - error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); - if (error) + if ((error = holdvnode(td->td_proc->p_fd, uap->filedes, &fp)) != 0) return(error); - return vacl_get_acl((struct vnode *)fp->f_data, uap->type, - uap->aclp); + error = vacl_get_acl((struct vnode *)fp->f_data, uap->type, uap->aclp); + fdrop(fp); + return (error); } /* @@ -225,11 +225,11 @@ __acl_set_fd(struct __acl_set_fd_args *uap) int error; KKASSERT(td->td_proc); - error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); - if (error) + if ((error = holdvnode(td->td_proc->p_fd, uap->filedes, &fp)) != 0) return(error); - return vacl_set_acl((struct vnode *)fp->f_data, uap->type, - uap->aclp); + error = vacl_set_acl((struct vnode *)fp->f_data, uap->type, uap->aclp); + fdrop(fp); + return (error); } /* @@ -268,10 +268,10 @@ __acl_delete_fd(struct __acl_delete_fd_args *uap) int error; KKASSERT(td->td_proc); - error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); - if (error) + if ((error = holdvnode(td->td_proc->p_fd, uap->filedes, &fp)) != 0) return(error); error = vacl_delete((struct vnode *)fp->f_data, uap->type); + fdrop(fp); return (error); } @@ -311,9 +311,10 @@ __acl_aclcheck_fd(struct __acl_aclcheck_fd_args *uap) int error; KKASSERT(td->td_proc); - error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); - if (error) + if ((error = holdvnode(td->td_proc->p_fd, uap->filedes, &fp)) != 0) return(error); - return vacl_aclcheck((struct vnode *)fp->f_data, uap->type, - uap->aclp); + error = vacl_aclcheck((struct vnode *)fp->f_data, uap->type, uap->aclp); + fdrop(fp); + return (error); } + diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 4360e59761..93fdf8cebd 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -70,7 +70,7 @@ * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 * $FreeBSD: src/sys/kern/kern_descrip.c,v 1.81.2.19 2004/02/28 00:43:31 tegge Exp $ - * $DragonFly: src/sys/kern/kern_descrip.c,v 1.61 2006/05/22 21:33:11 dillon Exp $ + * $DragonFly: src/sys/kern/kern_descrip.c,v 1.62 2006/05/24 03:23:31 dillon Exp $ */ #include "opt_compat.h" @@ 
-103,9 +103,11 @@ #include #include -static void fdreserve (struct filedesc *fdp, int fd0, int incr); -static struct file *funsetfd (struct filedesc *fdp, int fd); +static void fsetfd_locked(struct filedesc *fdp, struct file *fp, int fd); +static void fdreserve_locked (struct filedesc *fdp, int fd0, int incr); +static struct file *funsetfd_locked (struct filedesc *fdp, int fd); static int checkfpclosed(struct filedesc *fdp, int fd, struct file *fp); +static void ffree(struct file *fp); static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "file desc to leader", @@ -155,10 +157,12 @@ extern int cmask; /* * Fixup fd_freefile and fd_lastfile after a descriptor has been cleared. + * + * MPSAFE - must be called with fdp->fd_spin exclusively held */ static __inline void -fdfixup(struct filedesc *fdp, int fd) +fdfixup_locked(struct filedesc *fdp, int fd) { if (fd < fdp->fd_freefile) { fdp->fd_freefile = fd; @@ -173,15 +177,19 @@ fdfixup(struct filedesc *fdp, int fd) /* * System calls on descriptors. + * + * MPSAFE */ -/* ARGSUSED */ int getdtablesize(struct getdtablesize_args *uap) { struct proc *p = curproc; + struct plimit *limit = p->p_limit; + spin_lock_rd(&limit->p_spin); uap->sysmsg_result = - min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + min((int)limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + spin_unlock_rd(&limit->p_spin); return (0); } @@ -190,8 +198,9 @@ getdtablesize(struct getdtablesize_args *uap) * * note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). + * + * MPSAFE */ -/* ARGSUSED */ int dup2(struct dup2_args *uap) { @@ -204,8 +213,9 @@ dup2(struct dup2_args *uap) /* * Duplicate a file descriptor. + * + * MPSAFE */ -/* ARGSUSED */ int dup(struct dup_args *uap) { @@ -216,6 +226,9 @@ dup(struct dup_args *uap) return (error); } +/* + * MPALMOSTSAFE - acquires mplock for fp operations + */ int kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred) { @@ -228,33 +241,38 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred) KKASSERT(p); - if ((fp = holdfp(p->p_fd, fd, -1)) == NULL) - return (EBADF); - + /* + * Operations on file descriptors that do not require a file pointer. + */ switch (cmd) { - case F_DUPFD: - newmin = dat->fc_fd; - if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || - newmin > maxfilesperproc) { - error = EINVAL; - break; - } - error = kern_dup(DUP_VARIABLE, fd, newmin, &dat->fc_fd); - break; - case F_GETFD: error = fgetfdflags(p->p_fd, fd, &tmp); if (error == 0) dat->fc_cloexec = (tmp & UF_EXCLOSE) ? FD_CLOEXEC : 0; - break; + return (error); case F_SETFD: if (dat->fc_cloexec & FD_CLOEXEC) error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE); else error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE); + return (error); + case F_DUPFD: + newmin = dat->fc_fd; + error = kern_dup(DUP_VARIABLE, fd, newmin, &dat->fc_fd); + return (error); + default: break; + } + /* + * Operations on file pointers + */ + if ((fp = holdfp(p->p_fd, fd, -1)) == NULL) + return (EBADF); + + get_mplock(); + switch (cmd) { case F_GETFL: dat->fc_flags = OFLAGS(fp->f_flag); error = 0; @@ -368,12 +386,16 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred) error = EINVAL; break; } + rel_mplock(); + fdrop(fp); return (error); } /* * The file control system call. 
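kern_fcntl() above is reorganized into two phases: commands that touch only descriptor-table state (F_GETFD, F_SETFD, F_DUPFD) run before any file pointer is held, and only the remaining commands pay for the holdfp()/fdrop() pair and the mplock. A compilable toy of that dispatch split, with made-up command names and a simplified slot/file pair standing in for the kernel structures:

    #include <stdio.h>

    struct file_ { int refs; int data; };
    struct slot  { struct file_ *fp; int fdflags; };

    enum cmd { GET_FDFLAGS, SET_FDFLAGS, GET_DATA };

    static int
    mini_fcntl(struct slot *sl, enum cmd cmd, int *arg)
    {
        struct file_ *fp;

        /* Phase 1: per-descriptor state only; no reference needed. */
        switch (cmd) {
        case GET_FDFLAGS:
            *arg = sl->fdflags;
            return (0);
        case SET_FDFLAGS:
            sl->fdflags = *arg;
            return (0);
        default:
            break;
        }

        /* Phase 2: operations on the file object itself. */
        if ((fp = sl->fp) == NULL)
            return (-1);            /* EBADF analogue */
        ++fp->refs;                 /* holdfp() analogue */
        switch (cmd) {
        case GET_DATA:
            *arg = fp->data;
            break;
        default:
            break;
        }
        --fp->refs;                 /* fdrop() analogue */
        return (0);
    }

    int
    main(void)
    {
        struct file_ f = { 1, 42 };
        struct slot  s = { &f, 0 };
        int v = 0;

        mini_fcntl(&s, GET_DATA, &v);
        printf("data=%d refs=%d\n", v, f.refs);
        return (0);
    }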
+ * + * MPSAFE */ int fcntl(struct fcntl_args *uap) @@ -398,7 +420,7 @@ fcntl(struct fcntl_args *uap) case F_SETLK: case F_GETLK: error = copyin((caddr_t)uap->arg, &dat.fc_flock, - sizeof(struct flock)); + sizeof(struct flock)); if (error) return (error); break; @@ -436,6 +458,8 @@ fcntl(struct fcntl_args *uap) * kern_dup() to destructively dup over an existing file descriptor if new * is already open. DUP_VARIABLE tells kern_dup() to find the lowest * unused file descriptor that is greater than or equal to new. + * + * MPSAFE */ int kern_dup(enum dup_type type, int old, int new, int *res) @@ -454,18 +478,24 @@ kern_dup(enum dup_type type, int old, int new, int *res) * possibly to dup to. */ retry: - if (old < 0 || new < 0 || new > p->p_rlimit[RLIMIT_NOFILE].rlim_cur || - new >= maxfilesperproc) - return (EBADF); - if (old >= fdp->fd_nfiles || fdp->fd_files[old].fp == NULL) + spin_lock_wr(&fdp->fd_spin); + if (new < 0 || new > p->p_rlimit[RLIMIT_NOFILE].rlim_cur || + new >= maxfilesperproc) { + spin_unlock_wr(&fdp->fd_spin); + return (EINVAL); + } + if ((unsigned)old >= fdp->fd_nfiles || fdp->fd_files[old].fp == NULL) { + spin_unlock_wr(&fdp->fd_spin); return (EBADF); + } if (type == DUP_FIXED && old == new) { *res = new; + spin_unlock_wr(&fdp->fd_spin); return (0); } fp = fdp->fd_files[old].fp; oldflags = fdp->fd_files[old].fileflags; - fhold(fp); + fhold(fp); /* MPSAFE - can be called with a spinlock held */ /* * Allocate a new descriptor if DUP_VARIABLE, or expand the table @@ -479,23 +509,39 @@ retry: * setup for the next code block. */ if (type == DUP_VARIABLE || new >= fdp->fd_nfiles) { + spin_unlock_wr(&fdp->fd_spin); error = fdalloc(p, new, &newfd); + spin_lock_wr(&fdp->fd_spin); if (error) { + spin_unlock_wr(&fdp->fd_spin); fdrop(fp); return (error); } + /* + * Check for ripout + */ if (old >= fdp->fd_nfiles || fdp->fd_files[old].fp != fp) { - fsetfd(p, NULL, newfd); + fsetfd_locked(fdp, NULL, newfd); + spin_unlock_wr(&fdp->fd_spin); fdrop(fp); goto retry; } + /* + * Check for expansion race + */ if (type != DUP_VARIABLE && new != newfd) { - fsetfd(p, NULL, newfd); + fsetfd_locked(fdp, NULL, newfd); + spin_unlock_wr(&fdp->fd_spin); fdrop(fp); goto retry; } + /* + * Check for ripout, newfd reused old (this case probably + * can't occur). + */ if (old == newfd) { - fsetfd(p, NULL, newfd); + fsetfd_locked(fdp, NULL, newfd); + spin_unlock_wr(&fdp->fd_spin); fdrop(fp); goto retry; } @@ -503,6 +549,7 @@ retry: delfp = NULL; } else { if (fdp->fd_files[new].reserved) { + spin_unlock_wr(&fdp->fd_spin); fdrop(fp); printf("Warning: dup(): target descriptor %d is reserved, waiting for it to be resolved\n", new); tsleep(fdp, 0, "fdres", hz); @@ -512,19 +559,24 @@ retry: /* * If the target descriptor was never allocated we have * to allocate it. If it was we have to clean out the - * old descriptor. + * old descriptor. delfp inherits the ref from the + * descriptor table. */ delfp = fdp->fd_files[new].fp; fdp->fd_files[new].fp = NULL; fdp->fd_files[new].reserved = 1; if (delfp == NULL) { - fdreserve(fdp, new, 1); + fdreserve_locked(fdp, new, 1); if (new > fdp->fd_lastfile) fdp->fd_lastfile = new; } } + /* + * NOTE: still holding an exclusive spinlock + */ + /* * If a descriptor is being overwritten we may hve to tell * fdfree() to sleep to ensure that all relevant process @@ -546,9 +598,10 @@ retry: * * The fd_files[] array inherits fp's hold reference. 
*/ - fsetfd(p, fp, new); - fdrop(fp); + fsetfd_locked(fdp, fp, new); fdp->fd_files[new].fileflags = oldflags & ~UF_EXCLOSE; + spin_unlock_wr(&fdp->fd_spin); + fdrop(fp); *res = new; /* @@ -557,13 +610,17 @@ retry: * close() were performed on it). */ if (delfp) { - (void) closef(delfp, td); + (void)closef(delfp, td); if (holdleaders) { + spin_lock_wr(&fdp->fd_spin); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; + spin_unlock_wr(&fdp->fd_spin); wakeup(&fdp->fd_holdleaderscount); + } else { + spin_unlock_wr(&fdp->fd_spin); } } } @@ -687,15 +744,20 @@ fgetown(struct sigio *sigio) /* * Close many file descriptors. + * + * MPSAFE */ -/* ARGSUSED */ - int closefrom(struct closefrom_args *uap) { return(kern_closefrom(uap->fd)); } +/* + * Close all file descriptors greater than or equal to fd + * + * MPSAFE + */ int kern_closefrom(int fd) { @@ -706,35 +768,43 @@ kern_closefrom(int fd) KKASSERT(p); fdp = p->p_fd; - if ((unsigned)fd > fdp->fd_lastfile) - return (0); + if (fd < 0) + return (EINVAL); /* * NOTE: This function will skip unassociated descriptors and * reserved descriptors that have not yet been assigned. * fd_lastfile can change as a side effect of kern_close(). */ + spin_lock_wr(&fdp->fd_spin); while (fd <= fdp->fd_lastfile) { if (fdp->fd_files[fd].fp != NULL) { + spin_unlock_wr(&fdp->fd_spin); + /* ok if this races another close */ if (kern_close(fd) == EINTR) return (EINTR); + spin_lock_wr(&fdp->fd_spin); } ++fd; } + spin_unlock_wr(&fdp->fd_spin); return (0); } /* * Close a file descriptor. + * + * MPSAFE */ -/* ARGSUSED */ - int close(struct close_args *uap) { return(kern_close(uap->fd)); } +/* + * MPALMOSTSAFE - acquires mplock around knote_fdclose() calls + */ int kern_close(int fd) { @@ -748,8 +818,11 @@ kern_close(int fd) KKASSERT(p); fdp = p->p_fd; - if ((fp = funsetfd(fdp, fd)) == NULL) + spin_lock_wr(&fdp->fd_spin); + if ((fp = funsetfd_locked(fdp, fd)) == NULL) { + spin_unlock_wr(&fdp->fd_spin); return (EBADF); + } holdleaders = 0; if (p->p_fdtol != NULL) { /* @@ -764,15 +837,23 @@ kern_close(int fd) * we now hold the fp reference that used to be owned by the descriptor * array. 
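The holdleaders bookkeeping in kern_dup() above, repeated again in kern_close() below, has one shape: decrement the count under fd_spin, and if the last hold is released while a waiter is recorded, clear the flag, drop the spinlock, and only then issue the wakeup. A small pthread analogue of that decrement-then-wake ordering (mutex and condition variable stand in for fd_spin and tsleep()/wakeup(); all names are hypothetical):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cv = PTHREAD_COND_INITIALIZER;
    static int holdcount = 1;       /* outstanding holds */
    static int wantwakeup = 1;      /* a waiter is sleeping */

    static void
    rele_hold(void)
    {
        int wake = 0;

        pthread_mutex_lock(&lk);
        if (--holdcount == 0 && wantwakeup) {
            wantwakeup = 0;
            wake = 1;
        }
        pthread_mutex_unlock(&lk);       /* drop the lock first ... */
        if (wake)
            pthread_cond_broadcast(&cv); /* ... then wake the sleeper */
    }

    int
    main(void)
    {
        rele_hold();
        printf("holdcount=%d wantwakeup=%d\n", holdcount, wantwakeup);
        return (0);
    }

With a spinlock the unlock-before-wakeup order matters because wakeup() may immediately switch to the woken thread; with POSIX condition variables signalling after the unlock is merely the conventional equivalent.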
*/ - if (fd < fdp->fd_knlistsize) + spin_unlock_wr(&fdp->fd_spin); + if (fd < fdp->fd_knlistsize) { + get_mplock(); knote_fdclose(p, fd); + rel_mplock(); + } error = closef(fp, td); if (holdleaders) { + spin_lock_wr(&fdp->fd_spin); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; + spin_unlock_wr(&fdp->fd_spin); wakeup(&fdp->fd_holdleaderscount); + } else { + spin_unlock_wr(&fdp->fd_spin); } } return (error); @@ -786,17 +867,13 @@ kern_shutdown(int fd, int how) { struct thread *td = curthread; struct proc *p = td->td_proc; - struct filedesc *fdp; struct file *fp; int error; KKASSERT(p); - fdp = p->p_fd; - if ((unsigned)fd >= fdp->fd_nfiles || - (fp = fdp->fd_files[fd].fp) == NULL) + if ((fp = holdfp(p->p_fd, fd, -1)) == NULL) return (EBADF); - fhold(fp); error = fo_shutdown(fp, how); fdrop(fp); @@ -818,17 +895,13 @@ kern_fstat(int fd, struct stat *ub) { struct thread *td = curthread; struct proc *p = td->td_proc; - struct filedesc *fdp; struct file *fp; int error; KKASSERT(p); - fdp = p->p_fd; - if ((unsigned)fd >= fdp->fd_nfiles || - (fp = fdp->fd_files[fd].fp) == NULL) + if ((fp = holdfp(p->p_fd, fd, -1)) == NULL) return (EBADF); - fhold(fp); error = fo_stat(fp, ub, p->p_ucred); fdrop(fp); @@ -860,18 +933,14 @@ fpathconf(struct fpathconf_args *uap) { struct thread *td = curthread; struct proc *p = td->td_proc; - struct filedesc *fdp; struct file *fp; struct vnode *vp; int error = 0; KKASSERT(p); - fdp = p->p_fd; - if ((unsigned)uap->fd >= fdp->fd_nfiles || - (fp = fdp->fd_files[uap->fd].fp) == NULL) - return (EBADF); - fhold(fp); + if ((fp = holdfp(p->p_fd, uap->fd, -1)) == NULL) + return (EBADF); switch (fp->f_type) { case DTYPE_PIPE: @@ -899,8 +968,16 @@ static int fdexpand; SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, ""); +/* + * Grow the file table so it can hold through descriptor (want). + * + * The fdp's spinlock must be held exclusively on entry and may be held + * exclusively on return. The spinlock may be cycled by the routine. + * + * MPSAFE + */ static void -fdgrow(struct filedesc *fdp, int want) +fdgrow_locked(struct filedesc *fdp, int want) { struct fdnode *newfiles; struct fdnode *oldfiles; @@ -912,14 +989,18 @@ nf = 2 * nf + 1; } while (nf <= want); + spin_unlock_wr(&fdp->fd_spin); newfiles = malloc(nf * sizeof(struct fdnode), M_FILEDESC, M_WAITOK); + spin_lock_wr(&fdp->fd_spin); /* - * deal with file-table extend race that might have occured - * when malloc was blocked. + * We could have raced another extend while we were not holding + * the spinlock. */ if (fdp->fd_nfiles >= nf) { + spin_unlock_wr(&fdp->fd_spin); free(newfiles, M_FILEDESC); + spin_lock_wr(&fdp->fd_spin); return; } /* @@ -934,8 +1015,11 @@ fdp->fd_files = newfiles; fdp->fd_nfiles = nf; - if (oldfiles != fdp->fd_builtin_files) + if (oldfiles != fdp->fd_builtin_files) { + spin_unlock_wr(&fdp->fd_spin); free(oldfiles, M_FILEDESC); + spin_lock_wr(&fdp->fd_spin); + } fdexpand++; } @@ -966,9 +1050,15 @@ left_ancestor(int n) return ((n & (n + 1)) - 1); } +/* + * Traverse the in-place binary tree bottom-up adjusting the allocation + * count so scans can determine where free descriptors are located. 
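fdgrow_locked() above is the patch's standard answer to blocking work inside a spinlock: drop fd_spin around the M_WAITOK malloc(), retake it, and re-test whether another thread grew the table in the interim. A compilable userland sketch of that cycle-and-recheck idiom (a pthread mutex stands in for the spinlock; struct table and the sizes are made up):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct table {
        pthread_mutex_t lk;
        int *slots;
        int nslots;
    };

    /* called with t->lk held; may drop and retake it */
    static void
    grow_locked(struct table *t, int want)
    {
        int *newslots, *oldslots;
        int nf = t->nslots;

        while (nf <= want)
            nf = 2 * nf + 1;            /* same growth rule as fdgrow */

        pthread_mutex_unlock(&t->lk);   /* cannot block while locked */
        newslots = calloc(nf, sizeof(*newslots));
        pthread_mutex_lock(&t->lk);

        if (t->nslots >= nf) {          /* raced: someone else grew it */
            pthread_mutex_unlock(&t->lk);
            free(newslots);
            pthread_mutex_lock(&t->lk);
            return;
        }
        memcpy(newslots, t->slots, t->nslots * sizeof(*newslots));
        oldslots = t->slots;
        t->slots = newslots;
        t->nslots = nf;
        pthread_mutex_unlock(&t->lk);   /* free() may block too */
        free(oldslots);
        pthread_mutex_lock(&t->lk);
    }

    int
    main(void)
    {
        struct table t = { PTHREAD_MUTEX_INITIALIZER, NULL, 15 };

        t.slots = calloc(t.nslots, sizeof(int));
        pthread_mutex_lock(&t.lk);
        grow_locked(&t, 100);
        pthread_mutex_unlock(&t.lk);
        printf("nslots=%d\n", t.nslots);
        return (0);
    }

Every unlock/relock pair is a window in which fd_nfiles may change, which is why the size is re-tested and why callers must treat the table as volatile across the call.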
+ * + * MPSAFE - caller must be holding an exclusive spinlock on fdp + */ static void -fdreserve(struct filedesc *fdp, int fd, int incr) +fdreserve_locked(struct filedesc *fdp, int fd, int incr) { while (fd >= 0) { fdp->fd_files[fd].allocated += incr; @@ -981,6 +1071,8 @@ fdreserve(struct filedesc *fdp, int fd, int incr) * Reserve a file descriptor for the process. If no error occurs, the * caller MUST at some point call fsetfd() or assign a file pointer * or dispose of the reservation. + * + * MPSAFE */ int fdalloc(struct proc *p, int want, int *result) @@ -988,11 +1080,14 @@ fdalloc(struct proc *p, int want, int *result) struct filedesc *fdp = p->p_fd; int fd, rsize, rsum, node, lim; + spin_lock_rd(&p->p_limit->p_spin); lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + spin_unlock_rd(&p->p_limit->p_spin); if (want >= lim) return (EMFILE); + spin_lock_wr(&fdp->fd_spin); if (want >= fdp->fd_nfiles) - fdgrow(fdp, want); + fdgrow_locked(fdp, want); /* * Search for a free descriptor starting at the higher @@ -1040,9 +1135,11 @@ retry: /* * No space in current array. Expand? */ - if (fdp->fd_nfiles >= lim) + if (fdp->fd_nfiles >= lim) { + spin_unlock_wr(&fdp->fd_spin); return (EMFILE); - fdgrow(fdp, want); + } + fdgrow_locked(fdp, want); goto retry; found: @@ -1056,13 +1153,16 @@ found: KKASSERT(fdp->fd_files[fd].reserved == 0); fdp->fd_files[fd].fileflags = 0; fdp->fd_files[fd].reserved = 1; - fdreserve(fdp, fd, 1); + fdreserve_locked(fdp, fd, 1); + spin_unlock_wr(&fdp->fd_spin); return (0); } /* * Check to see whether n user file descriptors * are available to the process p. + * + * MPSAFE */ int fdavail(struct proc *p, int n) @@ -1071,16 +1171,24 @@ fdavail(struct proc *p, int n) struct fdnode *fdnode; int i, lim, last; + spin_lock_rd(&p->p_limit->p_spin); lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); - if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) - return (1); + spin_unlock_rd(&p->p_limit->p_spin); + spin_lock_rd(&fdp->fd_spin); + if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) { + spin_unlock_rd(&fdp->fd_spin); + return (1); + } last = min(fdp->fd_nfiles, lim); fdnode = &fdp->fd_files[fdp->fd_freefile]; for (i = last - fdp->fd_freefile; --i >= 0; ++fdnode) { - if (fdnode->fp == NULL && --n <= 0) + if (fdnode->fp == NULL && --n <= 0) { + spin_unlock_rd(&fdp->fd_spin); return (1); + } } + spin_unlock_rd(&fdp->fd_spin); return (0); } @@ -1097,6 +1205,8 @@ fdavail(struct proc *p, int n) * file pointer is NOT associated with the descriptor. If falloc * returns success, fsetfd() MUST be called to either associate the * file pointer or clear the reservation. + * + * NOT MPSAFE (MPUNSAFE) - crhold() */ int falloc(struct proc *p, struct file **resultfp, int *resultfd) @@ -1172,11 +1282,13 @@ checkfpclosed(struct filedesc *fdp, int fd, struct file *fp) * * If fp is NULL, the file descriptor is returned to the pool. 
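fdreserve_locked() above maintains cumulative allocation counts over an implicit binary tree laid on top of fd_files[]: reserving or releasing a descriptor adjusts the count at that slot and along its ancestor chain, which is what lets fdalloc()'s scan rule out fully allocated regions without visiting every slot. Since the loop runs while (fd >= 0), the walk plausibly descends via left_ancestor() (defined above as ((n & (n + 1)) - 1)), whose result eventually goes negative. A standalone toy of that counting walk (NSLOTS and the printed layout are arbitrary; only the index arithmetic mirrors the kernel's):

    #include <stdio.h>

    #define NSLOTS 15               /* 2^4 - 1, a complete tree */

    static int allocated[NSLOTS];   /* cumulative allocation counts */

    static int
    left_ancestor(int n)
    {
        return ((n & (n + 1)) - 1);
    }

    static void
    reserve(int fd, int incr)
    {
        /* adjust the count at fd and along its ancestor chain */
        while (fd >= 0) {
            allocated[fd] += incr;
            fd = left_ancestor(fd);
        }
    }

    int
    main(void)
    {
        int i;

        reserve(2, 1);              /* descriptor 2 becomes busy */
        reserve(5, 1);              /* descriptor 5 becomes busy */
        reserve(2, -1);             /* descriptor 2 is released  */
        for (i = 0; i < NSLOTS; ++i)
            printf("allocated[%d] = %d\n", i, allocated[i]);
        return (0);
    }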
*/ -void -fsetfd(struct proc *p, struct file *fp, int fd) -{ - struct filedesc *fdp = p->p_fd; +/* + * MPSAFE (exclusive spinlock must be held on call) + */ +static void +fsetfd_locked(struct filedesc *fdp, struct file *fp, int fd) +{ KKASSERT((unsigned)fd < fdp->fd_nfiles); KKASSERT(fdp->fd_files[fd].reserved != 0); if (fp) { @@ -1185,14 +1297,30 @@ fdp->fd_files[fd].reserved = 0; } else { fdp->fd_files[fd].reserved = 0; - fdreserve(fdp, fd, -1); - fdfixup(fdp, fd); + fdreserve_locked(fdp, fd, -1); + fdfixup_locked(fdp, fd); } } +/* + * MPSAFE + */ +void +fsetfd(struct proc *p, struct file *fp, int fd) +{ + struct filedesc *fdp = p->p_fd; + + spin_lock_wr(&fdp->fd_spin); + fsetfd_locked(fdp, fp, fd); + spin_unlock_wr(&fdp->fd_spin); +} + +/* + * MPSAFE (exclusive spinlock must be held on call) + */ static struct file * -funsetfd(struct filedesc *fdp, int fd) +funsetfd_locked(struct filedesc *fdp, int fd) { struct file *fp; @@ -1203,8 +1331,8 @@ funsetfd(struct filedesc *fdp, int fd) fdp->fd_files[fd].fp = NULL; fdp->fd_files[fd].fileflags = 0; - fdreserve(fdp, fd, -1); - fdfixup(fdp, fd); + fdreserve_locked(fdp, fd, -1); + fdfixup_locked(fdp, fd); return(fp); } @@ -1282,6 +1410,7 @@ fsetcred(struct file *fp, struct ucred *cr) /* * Free a file descriptor. */ +static void ffree(struct file *fp) { @@ -1314,6 +1443,8 @@ fdinit_bootstrap(struct proc *p0, struct filedesc *fdp0, int cmask) /* * Build a new filedesc structure. + * + * NOT MPSAFE (vref) */ struct filedesc * fdinit(struct proc *p) { @@ -1322,6 +1453,7 @@ struct filedesc *fdp = p->p_fd; newfdp = malloc(sizeof(struct filedesc), M_FILEDESC, M_WAITOK|M_ZERO); + spin_lock_rd(&fdp->fd_spin); if (fdp->fd_cdir) { newfdp->fd_cdir = fdp->fd_cdir; vref(newfdp->fd_cdir); @@ -1342,6 +1474,7 @@ vref(newfdp->fd_jdir); newfdp->fd_njdir = cache_hold(fdp->fd_njdir); } + spin_unlock_rd(&fdp->fd_spin); /* Create the file descriptor table. */ newfdp->fd_refcnt = 1; @@ -1357,116 +1490,149 @@ /* * Share a filedesc structure. + * + * MPSAFE */ struct filedesc * fdshare(struct proc *p) { - p->p_fd->fd_refcnt++; - return (p->p_fd); + struct filedesc *fdp; + + fdp = p->p_fd; + spin_lock_wr(&fdp->fd_spin); + fdp->fd_refcnt++; + spin_unlock_wr(&fdp->fd_spin); + return (fdp); } /* * Copy a filedesc structure. + * + * MPSAFE */ struct filedesc * fdcopy(struct proc *p) { - struct filedesc *newfdp, *fdp = p->p_fd; + struct filedesc *fdp = p->p_fd; + struct filedesc *newfdp; struct fdnode *fdnode; int i; + int ni; - /* Certain daemons might not have file descriptors. */ + /* + * Certain daemons might not have file descriptors. + */ if (fdp == NULL) return (NULL); - newfdp = malloc(sizeof(struct filedesc), M_FILEDESC, M_WAITOK); - *newfdp = *fdp; - if (newfdp->fd_cdir) { + /* + * Allocate the new filedesc and fd_files[] array. This can race + * with operations by other threads on the fdp so we have to be + * careful. + */ + newfdp = malloc(sizeof(struct filedesc), M_FILEDESC, M_WAITOK | M_ZERO); +again: + spin_lock_rd(&fdp->fd_spin); + if (fdp->fd_lastfile < NDFILE) { + newfdp->fd_files = newfdp->fd_builtin_files; + i = NDFILE; + } else { + /* + * We have to allocate (2^N-1) entries for our in-place + * binary tree. Allow the table to shrink. 
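fsetfd() above now follows the naming convention used throughout this commit: a *_locked() worker that simply assumes the caller holds fd_spin exclusively, plus a thin public wrapper that acquires and releases the lock around it. In userland terms (hypothetical names, a mutex in place of the spinlock):

    #include <pthread.h>
    #include <stdio.h>

    struct fdtab {
        pthread_mutex_t lk;
        int slots[16];
    };

    /* worker: caller must hold t->lk exclusively */
    static void
    setslot_locked(struct fdtab *t, int fd, int v)
    {
        t->slots[fd] = v;
    }

    /* public wrapper: supplies the locking contract itself */
    static void
    setslot(struct fdtab *t, int fd, int v)
    {
        pthread_mutex_lock(&t->lk);
        setslot_locked(t, fd, v);
        pthread_mutex_unlock(&t->lk);
    }

    int
    main(void)
    {
        struct fdtab t = { PTHREAD_MUTEX_INITIALIZER, { 0 } };

        setslot(&t, 3, 7);
        printf("slots[3]=%d\n", t.slots[3]);
        return (0);
    }

The split lets internal paths that already hold the spinlock, such as kern_dup() and fdcopy(), call the worker directly without recursing on the lock.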
+ */ + i = fdp->fd_nfiles; + ni = (i - 1) / 2; + while (ni > fdp->fd_lastfile && ni > NDFILE) { + i = ni; + ni = (i - 1) / 2; + } + spin_unlock_rd(&fdp->fd_spin); + newfdp->fd_files = malloc(i * sizeof(struct fdnode), + M_FILEDESC, M_WAITOK | M_ZERO); + + /* + * Check for race, retry + */ + spin_lock_rd(&fdp->fd_spin); + if (i <= fdp->fd_lastfile) { + spin_unlock_rd(&fdp->fd_spin); + free(newfdp->fd_files, M_FILEDESC); + goto again; + } + } + + /* + * Dup the remaining fields. vref() and cache_hold() can be + * safely called while holding the read spinlock on fdp. + * + * The read spinlock on fdp is still being held. + * + * NOTE: vref and cache_hold calls for the case where the vnode + * or cache entry already has at least one ref may be called + * while holding spin locks. + */ + if ((newfdp->fd_cdir = fdp->fd_cdir) != NULL) { vref(newfdp->fd_cdir); - newfdp->fd_ncdir = cache_hold(newfdp->fd_ncdir); + newfdp->fd_ncdir = cache_hold(fdp->fd_ncdir); } /* * We must check for fd_rdir here, at least for now because * the init process is created before we have access to the * rootvode to take a reference to it. */ - if (newfdp->fd_rdir) { + if ((newfdp->fd_rdir = fdp->fd_rdir) != NULL) { vref(newfdp->fd_rdir); - newfdp->fd_nrdir = cache_hold(newfdp->fd_nrdir); + newfdp->fd_nrdir = cache_hold(fdp->fd_nrdir); } - if (newfdp->fd_jdir) { + if ((newfdp->fd_jdir = fdp->fd_jdir) != NULL) { vref(newfdp->fd_jdir); - newfdp->fd_njdir = cache_hold(newfdp->fd_njdir); + newfdp->fd_njdir = cache_hold(fdp->fd_njdir); } newfdp->fd_refcnt = 1; - spin_init(&newfdp->fd_spin); - - /* - * If the number of open files fits in the internal arrays - * of the open file structure, use them, otherwise allocate - * additional memory for the number of descriptors currently - * in use. - */ - if (newfdp->fd_lastfile < NDFILE) { - newfdp->fd_files = newfdp->fd_builtin_files; - i = NDFILE; - } else { - /* - * Compute the smallest file table size - * for the file descriptors currently in use, - * allowing the table to shrink. - */ - i = newfdp->fd_nfiles; - while ((i-1)/2 > newfdp->fd_lastfile && (i-1)/2 > NDFILE) - i = (i-1)/2; - newfdp->fd_files = malloc(i * sizeof(struct fdnode), - M_FILEDESC, M_WAITOK); - } newfdp->fd_nfiles = i; - - if (fdp->fd_files != fdp->fd_builtin_files || - newfdp->fd_files != newfdp->fd_builtin_files - ) { - bcopy(fdp->fd_files, newfdp->fd_files, - i * sizeof(struct fdnode)); - } + newfdp->fd_lastfile = fdp->fd_lastfile; + newfdp->fd_freefile = fdp->fd_freefile; + newfdp->fd_cmask = fdp->fd_cmask; + newfdp->fd_knlist = NULL; + newfdp->fd_knlistsize = -1; + newfdp->fd_knhash = NULL; + newfdp->fd_knhashmask = 0; + spin_init(&newfdp->fd_spin); /* + * Copy the descriptor table through (i). This also copies the + * allocation state. Then go through and ref the file pointers + * and clean up any KQ descriptors. + * * kq descriptors cannot be copied. Since we haven't ref'd the * copied files yet we can ignore the return value from funsetfd(). + * + * The read spinlock on fdp is still being held. */ - if (newfdp->fd_knlistsize != -1) { - for (i = 0; i <= newfdp->fd_lastfile; ++i) { - fdnode = &newfdp->fd_files[i]; - if (fdnode->fp && fdnode->fp->f_type == DTYPE_KQUEUE) { - (void)funsetfd(newfdp, i); - } - } - newfdp->fd_knlist = NULL; - newfdp->fd_knlistsize = -1; - newfdp->fd_knhash = NULL; - newfdp->fd_knhashmask = 0; - } - - /* - * Ref the copied file pointers. Make sure any reserved but - * unassigned descriptors are cleared in the copy. 
- */ - for (i = 0; i <= newfdp->fd_lastfile; ++i) { + bcopy(fdp->fd_files, newfdp->fd_files, i * sizeof(struct fdnode)); + for (i = 0 ; i < newfdp->fd_nfiles; ++i) { fdnode = &newfdp->fd_files[i]; if (fdnode->reserved) { - fdreserve(newfdp, i, -1); + fdreserve_locked(newfdp, i, -1); fdnode->reserved = 0; - fdfixup(newfdp, i); + fdfixup_locked(newfdp, i); + } else if (fdnode->fp) { + if (fdnode->fp->f_type == DTYPE_KQUEUE) { + (void)funsetfd_locked(newfdp, i); + } else { + fhold(fdnode->fp); + } } - if (fdnode->fp) - fhold(fdnode->fp); } + spin_unlock_rd(&fdp->fd_spin); return (newfdp); } /* * Release a filedesc structure. + * + * NOT MPSAFE (MPSAFE for refs > 1, but the final cleanup code is not MPSAFE) */ void fdfree(struct proc *p) @@ -1484,6 +1650,11 @@ fdfree(struct proc *p) if (fdp == NULL) return; + /* + * Severe messing around to follow + */ + spin_lock_wr(&fdp->fd_spin); + /* Check for special need to clear POSIX style locks */ fdtol = p->p_fdtol; if (fdtol != NULL) { @@ -1500,6 +1671,8 @@ fdfree(struct proc *p) } fp = fdnode->fp; fhold(fp); + spin_unlock_wr(&fdp->fd_spin); + lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; @@ -1511,6 +1684,7 @@ fdfree(struct proc *p) &lf, F_POSIX); fdrop(fp); + spin_lock_wr(&fdp->fd_spin); } } retry: @@ -1522,8 +1696,10 @@ fdfree(struct proc *p) * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; + spin_unlock_wr(&fdp->fd_spin); tsleep(&fdp->fd_holdleaderscount, 0, "fdlhold", 0); + spin_lock_wr(&fdp->fd_spin); goto retry; } if (fdtol->fdl_holdcount > 0) { @@ -1532,7 +1708,9 @@ fdfree(struct proc *p) * remains valid in closef(). */ fdtol->fdl_wakeup = 1; + spin_unlock_wr(&fdp->fd_spin); tsleep(fdtol, 0, "fdlhold", 0); + spin_lock_wr(&fdp->fd_spin); goto retry; } } @@ -1541,14 +1719,22 @@ fdfree(struct proc *p) fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; - } else + } else { fdtol = NULL; + } p->p_fdtol = NULL; - if (fdtol != NULL) + if (fdtol != NULL) { + spin_unlock_wr(&fdp->fd_spin); free(fdtol, M_FILEDESC_TO_LEADER); + spin_lock_wr(&fdp->fd_spin); + } } - if (--fdp->fd_refcnt > 0) + if (--fdp->fd_refcnt > 0) { + spin_unlock_wr(&fdp->fd_spin); return; + } + spin_unlock_wr(&fdp->fd_spin); + /* * we are the last reference to the structure, we can * safely assume it will not change out from under us. @@ -1609,40 +1795,68 @@ done: * holdsock() - load the struct file pointer associated * with a socket into *fpp. If an error occurs, non-zero * will be returned and *fpp will be set to NULL. + * + * MPSAFE */ int -holdsock(struct filedesc *fdp, int fdes, struct file **fpp) +holdsock(struct filedesc *fdp, int fd, struct file **fpp) { struct file *fp; - int error = 0; + int error; - *fpp = NULL; - if ((unsigned)fdes >= fdp->fd_nfiles) - return EBADF; - if ((fp = fdp->fd_files[fdes].fp) == NULL) - return EBADF; - if (fp->f_type != DTYPE_SOCKET) - return ENOTSOCK; + spin_lock_rd(&fdp->fd_spin); + if ((unsigned)fd >= fdp->fd_nfiles) { + error = EBADF; + fp = NULL; + goto done; + } + if ((fp = fdp->fd_files[fd].fp) == NULL) { + error = EBADF; + goto done; + } + if (fp->f_type != DTYPE_SOCKET) { + error = ENOTSOCK; + goto done; + } fhold(fp); + error = 0; +done: + spin_unlock_rd(&fdp->fd_spin); *fpp = fp; return (error); } /* - * Convert a user file descriptor to a kernel file entry. + * Convert a user file descriptor to a held file pointer. 
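holdsock() above (and holdvnode() just below) do three things under the table's read spinlock: bounds-check the descriptor, validate the entry's type, and take the reference before the lock is released, so the file cannot be ripped out of the table between lookup and fhold(). A compilable miniature of that lookup-validate-hold shape (pthread rwlock and struct entry are stand-ins; the error values only mimic EBADF/ENOTSOCK):

    #include <pthread.h>
    #include <stdio.h>

    #define T_SOCKET 1

    struct entry { int refs; int type; };

    struct tab {
        pthread_rwlock_t lk;
        struct entry *slots[8];
        int nslots;
    };

    static int
    hold_entry(struct tab *t, int i, struct entry **epp)
    {
        struct entry *ep;
        int error;

        pthread_rwlock_rdlock(&t->lk);
        if ((unsigned)i >= (unsigned)t->nslots) {
            error = 1;              /* EBADF analogue */
            ep = NULL;
            goto done;
        }
        if ((ep = t->slots[i]) == NULL) {
            error = 1;              /* EBADF analogue */
            goto done;
        }
        if (ep->type != T_SOCKET) {
            error = 2;              /* ENOTSOCK analogue */
            ep = NULL;
            goto done;
        }
        ++ep->refs;                 /* ref taken while still locked */
        error = 0;
    done:
        pthread_rwlock_unlock(&t->lk);
        *epp = ep;
        return (error);
    }

    int
    main(void)
    {
        struct tab t = { .nslots = 8 };
        struct entry e = { 1, T_SOCKET };
        struct entry *ep;
        int error;

        pthread_rwlock_init(&t.lk, NULL);
        t.slots[3] = &e;
        error = hold_entry(&t, 3, &ep);
        printf("error=%d refs=%d\n", error, e.refs);
        return (0);
    }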
+ * + * MPSAFE */ int -getvnode(struct filedesc *fdp, int fd, struct file **fpp) +holdvnode(struct filedesc *fdp, int fd, struct file **fpp) { struct file *fp; - - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_files[fd].fp) == NULL) - return (EBADF); - if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) - return (EINVAL); + int error; + + spin_lock_rd(&fdp->fd_spin); + if ((unsigned)fd >= fdp->fd_nfiles) { + error = EBADF; + fp = NULL; + goto done; + } + if ((fp = fdp->fd_files[fd].fp) == NULL) { + error = EBADF; + goto done; + } + if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { + error = EINVAL; + goto done; + } + fhold(fp); + error = 0; +done: + spin_unlock_rd(&fdp->fd_spin); *fpp = fp; - return (0); + return (error); } /* @@ -1668,6 +1882,8 @@ is_unsafe(struct file *fp) /* * Make this setguid thing safe, if at all possible. + * + * NOT MPSAFE - scans fdp without spinlocks, calls knote_fdclose() */ void setugidsafety(struct proc *p) @@ -1696,7 +1912,7 @@ setugidsafety(struct proc *p) * NULL-out descriptor prior to close to avoid * a race while close blocks. */ - if ((fp = funsetfd(fdp, i)) != NULL) + if ((fp = funsetfd_locked(fdp, i)) != NULL) closef(fp, td); } } @@ -1704,6 +1920,8 @@ setugidsafety(struct proc *p) /* * Close any files on exec? + * + * NOT MPSAFE - scans fdp without spinlocks, calls knote_fdclose() */ void fdcloseexec(struct proc *p) @@ -1731,7 +1949,7 @@ fdcloseexec(struct proc *p) * NULL-out descriptor prior to close to avoid * a race while close blocks. */ - if ((fp = funsetfd(fdp, i)) != NULL) + if ((fp = funsetfd_locked(fdp, i)) != NULL) closef(fp, td); } } @@ -1743,6 +1961,8 @@ fdcloseexec(struct proc *p) * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. + * + * NOT MPSAFE - calls falloc, vn_open, etc */ int fdcheckstd(struct proc *p) @@ -1793,6 +2013,8 @@ fdcheckstd(struct proc *p) * Decrement reference count on file structure. * Note: td and/or p may be NULL when closing a file * that was being passed in a message. + * + * MPALMOSTSAFE - acquires mplock for VOP operations */ int closef(struct file *fp, struct thread *td) @@ -1810,6 +2032,7 @@ closef(struct file *fp, struct thread *td) } else { p = td->td_proc; /* can also be NULL */ } + get_mplock(); /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting @@ -1859,6 +2082,7 @@ closef(struct file *fp, struct thread *td) } } } + rel_mplock(); return (fdrop(fp)); } @@ -1867,9 +2091,11 @@ closef(struct file *fp, struct thread *td) * * fhold() can only be called if f_count is already at least 1 (i.e. the * caller of fhold() already has a reference to the file pointer in some - * manner or other). In addition, fhold() must be callable with a spinlock - * held on the governing structure that the caller went through to find the - * fp. + * manner or other). + * + * This is a rare case where callers are allowed to hold spinlocks, so + * we can't ourselves. Since we are not obtaining the fp spinlock, + * we have to use an atomic lock to interlock against fdrop(). */ void fhold(struct file *fp) @@ -1878,9 +2104,10 @@ fhold(struct file *fp) } /* - * MPSAFE TODO: VOP_ADVLOCK, fo_close + * A spinlock is required to handle 1->0 transitions on f_count. We have + * to use atomic_sub_int so as not to race the atomic_add_int in fhold(). * - * A spinlock is required to handle 1->0 transitions on f_count. 
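The rewritten fhold() comment above states the commit's core refcounting rule: because callers may already hold spinlocks, fhold() cannot take one itself and must be a single atomic increment, which is only sound when the caller already owns a reference (the count can then never be observed at zero and revived). A C11-atomics model of that rule (the real fdrop() additionally takes f_spin around the 1->0 transition, which this sketch collapses into one fetch-sub):

    #include <stdatomic.h>
    #include <stdio.h>

    struct file_ {
        atomic_int count;           /* f_count analogue */
    };

    /* legal only if the caller already holds a reference */
    static void
    fhold_(struct file_ *fp)
    {
        atomic_fetch_add(&fp->count, 1);
    }

    /* returns 1 when the last reference goes away */
    static int
    fdrop_(struct file_ *fp)
    {
        if (atomic_fetch_sub(&fp->count, 1) == 1) {
            /* final close/free sequence would run here */
            return (1);
        }
        return (0);
    }

    int
    main(void)
    {
        struct file_ f;

        atomic_init(&f.count, 1);
        fhold_(&f);
        printf("freed=%d\n", fdrop_(&f));   /* 0: one ref remains */
        printf("freed=%d\n", fdrop_(&f));   /* 1: last ref released */
        return (0);
    }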
+ * MPALMOSTSAFE - acquires mplock for final close sequence */ int fdrop(struct file *fp) @@ -1890,12 +2117,15 @@ fdrop(struct file *fp) int error; spin_lock_wr(&fp->f_spin); - if (--fp->f_count > 0) { + atomic_subtract_int(&fp->f_count, 1); + if (fp->f_count > 0) { spin_unlock_wr(&fp->f_spin); return (0); } spin_unlock_wr(&fp->f_spin); + get_mplock(); + /* * The last reference has gone away, we own the fp structure free * and clear. @@ -1915,6 +2145,7 @@ fdrop(struct file *fp) else error = 0; ffree(fp); + rel_mplock(); return (error); } @@ -1924,21 +2155,21 @@ fdrop(struct file *fp) * Just attempt to get a record lock of the requested type on * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ -/* ARGSUSED */ int flock(struct flock_args *uap) { struct proc *p = curproc; - struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; struct flock lf; + int error; - if ((unsigned)uap->fd >= fdp->fd_nfiles || - (fp = fdp->fd_files[uap->fd].fp) == NULL) + if ((fp = holdfp(p->p_fd, uap->fd, -1)) == NULL) return (EBADF); - if (fp->f_type != DTYPE_VNODE) - return (EOPNOTSUPP); + if (fp->f_type != DTYPE_VNODE) { + error = EOPNOTSUPP; + goto done; + } vp = (struct vnode *)fp->f_data; lf.l_whence = SEEK_SET; lf.l_start = 0; @@ -1946,18 +2177,25 @@ flock(struct flock_args *uap) if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; fp->f_flag &= ~FHASLOCK; - return (VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0)); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0); + goto done; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; - else - return (EBADF); + else { + error = EBADF; + goto done; + } fp->f_flag |= FHASLOCK; if (uap->how & LOCK_NB) - return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 0)); - return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_WAIT)); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 0); + else + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_WAIT); +done: + fdrop(fp); + return (error); } /* @@ -1989,6 +2227,8 @@ fdopen(dev_t dev, int mode, int type, struct thread *td) /* * The caller has reserved the file descriptor dfd for us. On success we * must fsetfd() it. On failure the caller will clean it up. + * + * NOT MPSAFE - isn't getting spinlocks, possibly other things */ int dupfdopen(struct proc *p, int dfd, int sfd, int mode, int error) @@ -2028,7 +2268,7 @@ dupfdopen(struct proc *p, int dfd, int sfd, int mode, int error) */ fdp->fd_files[dfd].fileflags = fdp->fd_files[sfd].fileflags; fsetfd(p, wfp, dfd); - if ((xfp = funsetfd(fdp, sfd)) != NULL) + if ((xfp = funsetfd_locked(fdp, sfd)) != NULL) fdrop(xfp); KKASSERT(xfp == wfp); /* XXX MP RACE */ error = 0; @@ -2040,7 +2280,10 @@ dupfdopen(struct proc *p, int dfd, int sfd, int mode, int error) return (error); } - +/* + * NOT MPSAFE - I think these refer to a common file descriptor table + * and we need to spinlock that to link fdtol in. + */ struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct proc *leader) @@ -2067,6 +2310,8 @@ filedesc_to_leader_alloc(struct filedesc_to_leader *old, /* * Get file structures. 
+ * + * NOT MPSAFE - process list scan, SYSCTL_OUT (probably not mpsafe) */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) @@ -2110,6 +2355,7 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS) if ((fdp = p->p_fd) == NULL) continue; PHOLD(p); + spin_lock_rd(&fdp->fd_spin); for (n = 0; n < fdp->fd_nfiles; ++n) { if ((fp = fdp->fd_files[n].fp) == NULL) continue; @@ -2118,11 +2364,14 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS) } else { uid = p->p_ucred ? p->p_ucred->cr_uid : -1; kcore_make_file(&kf, fp, p->p_pid, uid, n); + spin_unlock_rd(&fdp->fd_spin); error = SYSCTL_OUT(req, &kf, sizeof(kf)); + spin_lock_rd(&fdp->fd_spin); if (error) break; } } + spin_unlock_rd(&fdp->fd_spin); PRELE(p); if (error) break; diff --git a/sys/kern/kern_slaballoc.c b/sys/kern/kern_slaballoc.c index 17b14a3d01..9ca6262bff 100644 --- a/sys/kern/kern_slaballoc.c +++ b/sys/kern/kern_slaballoc.c @@ -1,5 +1,5 @@ /* - * KERN_SLABALLOC.C - Kernel SLAB memory allocator (MP SAFE) + * KERN_SLABALLOC.C - Kernel SLAB memory allocator * * Copyright (c) 2003,2004 The DragonFly Project. All rights reserved. * @@ -33,7 +33,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/kern/kern_slaballoc.c,v 1.37 2005/12/07 04:49:54 dillon Exp $ + * $DragonFly: src/sys/kern/kern_slaballoc.c,v 1.38 2006/05/24 03:23:31 dillon Exp $ * * This module implements a slab allocator drop-in replacement for the * kernel malloc(). @@ -372,7 +372,7 @@ zoneindex(unsigned long *bytes) } /* - * malloc() (SLAB ALLOCATOR) (MPSAFE) + * malloc() (SLAB ALLOCATOR) * * Allocate memory via the slab allocator. If the request is too large, * or if it page-aligned beyond a certain size, we fall back to the @@ -384,6 +384,8 @@ zoneindex(unsigned long *bytes) * M_ZERO - zero the returned memory. * M_USE_RESERVE - allow greater drawdown of the free list * M_USE_INTERRUPT_RESERVE - allow the freelist to be exhausted + * + * MPSAFE */ void * malloc(unsigned long size, struct malloc_type *type, int flags) @@ -777,11 +779,13 @@ free_remote(void *ptr) #endif /* - * free (SLAB ALLOCATOR) (MP SAFE) + * free (SLAB ALLOCATOR) * * Free a memory block previously allocated by malloc. Note that we do not * attempt to uplodate ks_loosememuse as MP races could prevent us from * checking memory limits in malloc. + * + * MPSAFE */ void free(void *ptr, struct malloc_type *type) @@ -1003,7 +1007,7 @@ chunk_mark_free(SLZone *z, void *chunk) #endif /* - * kmem_slab_alloc() (MP SAFE) (GETS BGL) + * kmem_slab_alloc() * * Directly allocate and wire kernel memory in PAGE_SIZE chunks with the * specified alignment. M_* flags are expected in the flags field. @@ -1021,6 +1025,8 @@ chunk_mark_free(SLZone *z, void *chunk) * it is free to use PQ_CACHE pages. * * This routine will currently obtain the BGL. + * + * MPALMOSTSAFE - acquires mplock */ static void * kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags) @@ -1174,7 +1180,9 @@ kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags) } /* - * kmem_slab_free() (MP SAFE) (GETS BGL) + * kmem_slab_free() + * + * MPALMOSTSAFE - acquires mplock */ static void kmem_slab_free(void *ptr, vm_size_t size) diff --git a/sys/kern/subr_kcore.c b/sys/kern/subr_kcore.c index 5996cc2b95..4e4f084032 100644 --- a/sys/kern/subr_kcore.c +++ b/sys/kern/subr_kcore.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - $DragonFly: src/sys/kern/subr_kcore.c,v 1.1 2004/11/24 22:51:01 joerg Exp $ + * $DragonFly: src/sys/kern/subr_kcore.c,v 1.2 2006/05/24 03:23:31 dillon Exp $ */ /* @@ -54,6 +54,9 @@ # include #endif +/* + * MPSAFE - caller must hold requisite spinlocks + */ void kcore_make_file(struct kinfo_file *ufile, struct file *kfile, pid_t pid, uid_t owner, int n) diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index 2c4eacf31b..614501aadc 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -1,4 +1,4 @@ - $DragonFly: src/sys/kern/syscalls.master,v 1.33 2006/05/23 20:35:10 dillon Exp $ + $DragonFly: src/sys/kern/syscalls.master,v 1.34 2006/05/24 03:23:31 dillon Exp $ ; @(#)syscalls.master 8.2 (Berkeley) 1/13/94 ; $FreeBSD: src/sys/kern/syscalls.master,v 1.72.2.10 2002/07/12 08:22:46 alfred Exp $ @@ -155,7 +155,7 @@ gethostname gethostname_args int 88 COMPAT BSD { int sethostname(char *hostname, u_int len); } \ sethostname sethostname_args int -89 STD BSD { int getdtablesize(void); } +89 MPSAFE STD BSD { int getdtablesize(void); } 90 STD POSIX { int dup2(u_int from, u_int to); } 91 UNIMPL BSD getdopt 92 STD POSIX { int fcntl(int fd, int cmd, long arg); } diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index af2b689c23..5fb4ae47c6 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -67,7 +67,7 @@ * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 * $FreeBSD: src/sys/kern/vfs_cache.c,v 1.42.2.6 2001/10/05 20:07:03 dillon Exp $ - * $DragonFly: src/sys/kern/vfs_cache.c,v 1.67 2006/05/06 02:43:12 dillon Exp $ + * $DragonFly: src/sys/kern/vfs_cache.c,v 1.68 2006/05/24 03:23:31 dillon Exp $ */ #include @@ -202,12 +202,17 @@ static void cache_zap(struct namecache *ncp); * cache_hold() and cache_drop() prevent the premature deletion of a * namecache entry but do not prevent operations (such as zapping) on * that namecache entry. + * + * This routine may only be called if nc_refs is already at least 1. + * + * This is a rare case where callers are allowed to hold spinlocks, so + * we can't ourselves. */ static __inline struct namecache * _cache_hold(struct namecache *ncp) { - ++ncp->nc_refs; + atomic_add_int(&ncp->nc_refs, 1); return(ncp); } @@ -317,6 +322,9 @@ cache_free(struct namecache *ncp) /* * Ref and deref a namecache structure. + * + * Warning: caller may hold an unrelated read spinlock, which means we can't + * use read spinlocks here. */ struct namecache * cache_hold(struct namecache *ncp) diff --git a/sys/kern/vfs_lock.c b/sys/kern/vfs_lock.c index 35d3365623..80ef065df8 100644 --- a/sys/kern/vfs_lock.c +++ b/sys/kern/vfs_lock.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/kern/vfs_lock.c,v 1.17 2006/05/06 02:43:12 dillon Exp $ + * $DragonFly: src/sys/kern/vfs_lock.c,v 1.18 2006/05/24 03:23:31 dillon Exp $ */ /* @@ -86,6 +86,9 @@ vfs_lock_init(void) /* * Inline helper functions. vbusy() and vfree() must be called while in a * critical section. + * + * Warning: must be callable if the caller holds a read spinlock to something + * else, meaning we can't use read spinlocks here. */ static __inline void @@ -134,6 +137,9 @@ vshouldfree(struct vnode *vp, int usecount) * * Special cases: refing a vnode does not clear VINACTIVE, you have to vget() * the vnode shared or exclusive to do that. + * + * Warning: must be callable if the caller holds a read spinlock to something + * else, meaning we can't use read spinlocks here. 
*/ static __inline void @@ -144,6 +150,16 @@ __vref(struct vnode *vp) __vbusy(vp); } +/* + * This is a rare case where callers are allowed to hold spinlocks, so + * we can't ourselves. In such cases the vnode must already have at least + * one reference because we cannot get the spinlock required to move + * the vnode off the free list. + * + * If the usecount & holdcnt are 0 the caller must be holding the + * free list spinlock since we will be removing the vnode from the + * freelist in that case. + */ void vref(struct vnode *vp) { diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 6895e87423..81d6efef8e 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -37,7 +37,7 @@ * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $ - * $DragonFly: src/sys/kern/vfs_syscalls.c,v 1.92 2006/05/22 21:21:21 dillon Exp $ + * $DragonFly: src/sys/kern/vfs_syscalls.c,v 1.93 2006/05/24 03:23:31 dillon Exp $ */ #include @@ -870,22 +870,23 @@ kern_fstatfs(int fd, struct statfs *buf) int error; KKASSERT(p); - error = getvnode(p->p_fd, fd, &fp); - if (error) + if ((error = holdvnode(p->p_fd, fd, &fp)) != 0) return (error); mp = ((struct vnode *)fp->f_data)->v_mount; - if (mp == NULL) - return (EBADF); - if (fp->f_cred == NULL) - return (EINVAL); + if (mp == NULL) { + error = EBADF; + goto done; + } + if (fp->f_cred == NULL) { + error = EINVAL; + goto done; + } sp = &mp->mnt_stat; - error = VFS_STATFS(mp, sp, fp->f_cred); - if (error) - return (error); + if ((error = VFS_STATFS(mp, sp, fp->f_cred)) != 0) + goto done; - error = cache_fullpath(p, mp->mnt_ncp, &fullpath, &freepath); - if (error) - return(error); + if ((error = cache_fullpath(p, mp->mnt_ncp, &fullpath, &freepath)) != 0) + goto done; bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname)); free(freepath, M_TEMP); @@ -896,7 +897,10 @@ kern_fstatfs(int fd, struct statfs *buf) /* Only root should have access to the fsid's. 
*/ if (suser(td)) buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0; - return (0); + error = 0; +done: + fdrop(fp); + return (error); } /* @@ -1029,7 +1033,7 @@ fchdir(struct fchdir_args *uap) struct namecache *nct; int error; - if ((error = getvnode(fdp, uap->fd, &fp)) != 0) + if ((error = holdvnode(fdp, uap->fd, &fp)) != 0) return (error); vp = (struct vnode *)fp->f_data; vref(vp); @@ -1040,6 +1044,7 @@ fchdir(struct fchdir_args *uap) error = VOP_ACCESS(vp, VEXEC, p->p_ucred); if (error) { vput(vp); + fdrop(fp); return (error); } ncp = cache_hold(fp->f_ncp); @@ -1067,6 +1072,7 @@ fchdir(struct fchdir_args *uap) cache_drop(ncp); vput(vp); } + fdrop(fp); return (error); } @@ -1136,12 +1142,14 @@ chroot_refuse_vdir_fds(fdp) int fd; for (fd = 0; fd < fdp->fd_nfiles ; fd++) { - error = getvnode(fdp, fd, &fp); - if (error) + if ((error = holdvnode(fdp, fd, &fp)) != 0) continue; vp = (struct vnode *)fp->f_data; - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp); continue; + } + fdrop(fp); return(EPERM); } return (0); @@ -2133,9 +2141,11 @@ fchflags(struct fchflags_args *uap) struct file *fp; int error; - if ((error = getvnode(p->p_fd, uap->fd, &fp)) != 0) + if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0) return (error); - return setfflags((struct vnode *) fp->f_data, uap->flags); + error = setfflags((struct vnode *) fp->f_data, uap->flags); + fdrop(fp); + return (error); } static int @@ -2227,9 +2237,11 @@ fchmod(struct fchmod_args *uap) struct file *fp; int error; - if ((error = getvnode(p->p_fd, uap->fd, &fp)) != 0) + if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0) return (error); - return setfmode((struct vnode *)fp->f_data, uap->mode); + error = setfmode((struct vnode *)fp->f_data, uap->mode); + fdrop(fp); + return (error); } static int @@ -2320,10 +2332,11 @@ fchown(struct fchown_args *uap) struct file *fp; int error; - if ((error = getvnode(p->p_fd, uap->fd, &fp)) != 0) + if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0) return (error); - return setfown((struct vnode *)fp->f_data, - uap->uid, uap->gid); + error = setfown((struct vnode *)fp->f_data, uap->uid, uap->gid); + fdrop(fp); + return (error); } static int @@ -2445,10 +2458,10 @@ kern_futimes(int fd, struct timeval *tptr) error = getutimes(tptr, ts); if (error) return (error); - error = getvnode(p->p_fd, fd, &fp); - if (error) + if ((error = holdvnode(p->p_fd, fd, &fp)) != 0) return (error); error = setutimes((struct vnode *)fp->f_data, ts, tptr == NULL); + fdrop(fp); return (error); } @@ -2534,20 +2547,24 @@ kern_ftruncate(int fd, off_t length) if (length < 0) return(EINVAL); - if ((error = getvnode(p->p_fd, fd, &fp)) != 0) + if ((error = holdvnode(p->p_fd, fd, &fp)) != 0) return (error); - if ((fp->f_flag & FWRITE) == 0) - return (EINVAL); + if ((fp->f_flag & FWRITE) == 0) { + error = EINVAL; + goto done; + } vp = (struct vnode *)fp->f_data; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - if (vp->v_type == VDIR) + if (vp->v_type == VDIR) { error = EISDIR; - else if ((error = vn_writechk(vp)) == 0) { + } else if ((error = vn_writechk(vp)) == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, fp->f_cred); } VOP_UNLOCK(vp, 0); +done: + fdrop(fp); return (error); } @@ -2582,7 +2599,7 @@ fsync(struct fsync_args *uap) vm_object_t obj; int error; - if ((error = getvnode(p->p_fd, uap->fd, &fp)) != 0) + if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0) return (error); vp = (struct vnode *)fp->f_data; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); @@ -2590,9 +2607,11 @@ fsync(struct fsync_args *uap) 
vm_object_page_clean(obj, 0, 0, 0); if ((error = VOP_FSYNC(vp, MNT_WAIT)) == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP) && - bioops.io_fsync) + bioops.io_fsync) { error = (*bioops.io_fsync)(vp); + } VOP_UNLOCK(vp, 0); + fdrop(fp); return (error); } @@ -2861,14 +2880,18 @@ kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res, long loff; int error, eofflag; - if ((error = getvnode(p->p_fd, fd, &fp)) != 0) + if ((error = holdvnode(p->p_fd, fd, &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) - return (EBADF); + if ((fp->f_flag & FREAD) == 0) { + error = EBADF; + goto done; + } vp = (struct vnode *)fp->f_data; unionread: - if (vp->v_type != VDIR) - return (EINVAL); + if (vp->v_type != VDIR) { + error = EINVAL; + goto done; + } aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; @@ -2884,14 +2907,14 @@ unionread: fp->f_offset = auio.uio_offset; VOP_UNLOCK(vp, 0); if (error) - return (error); + goto done; if (count == auio.uio_resid) { if (union_dircheckp) { error = union_dircheckp(td, &vp, fp); if (error == -1) goto unionread; if (error) - return (error); + goto done; } if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { @@ -2908,6 +2931,8 @@ unionread: *basep = loff; } *res = count - auio.uio_resid; +done: + fdrop(fp); return (error); } diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h index 1131a42fa6..07217fa799 100644 --- a/sys/sys/filedesc.h +++ b/sys/sys/filedesc.h @@ -32,7 +32,7 @@ * * @(#)filedesc.h 8.1 (Berkeley) 6/2/93 * $FreeBSD: src/sys/sys/filedesc.h,v 1.19.2.5 2003/06/06 20:21:32 tegge Exp $ - * $DragonFly: src/sys/sys/filedesc.h,v 1.17 2006/05/22 21:21:26 dillon Exp $ + * $DragonFly: src/sys/sys/filedesc.h,v 1.18 2006/05/24 03:23:33 dillon Exp $ */ #ifndef _SYS_FILEDESC_H_ @@ -163,7 +163,6 @@ int fgetfdflags(struct filedesc *fdp, int fd, int *flagsp); int fsetfdflags(struct filedesc *fdp, int fd, int add_flags); int fclrfdflags(struct filedesc *fdp, int fd, int rem_flags); void fsetcred (struct file *fp, struct ucred *cr); -void ffree (struct file *); void fdinit_bootstrap(struct proc *p0, struct filedesc *fdp0, int cmask); struct filedesc *fdinit (struct proc *p); struct filedesc *fdshare (struct proc *p); @@ -174,7 +173,7 @@ void fdcloseexec (struct proc *p); int fdcheckstd (struct proc *p); struct file *holdfp (struct filedesc *fdp, int fd, int flag); int holdsock (struct filedesc *fdp, int fdes, struct file **fpp); -int getvnode (struct filedesc *fdp, int fd, struct file **fpp); +int holdvnode (struct filedesc *fdp, int fd, struct file **fpp); int fdissequential (struct file *); void fdsequential (struct file *, int); pid_t fgetown (struct sigio *); diff --git a/sys/vfs/fdesc/fdesc_vnops.c b/sys/vfs/fdesc/fdesc_vnops.c index f1f00907ee..475372eb3d 100644 --- a/sys/vfs/fdesc/fdesc_vnops.c +++ b/sys/vfs/fdesc/fdesc_vnops.c @@ -36,7 +36,7 @@ * @(#)fdesc_vnops.c 8.9 (Berkeley) 1/21/94 * * $FreeBSD: src/sys/miscfs/fdesc/fdesc_vnops.c,v 1.47.2.1 2001/10/22 22:49:26 chris Exp $ - * $DragonFly: src/sys/vfs/fdesc/fdesc_vnops.c,v 1.29 2006/05/19 07:33:46 dillon Exp $ + * $DragonFly: src/sys/vfs/fdesc/fdesc_vnops.c,v 1.30 2006/05/24 03:23:35 dillon Exp $ */ /* @@ -386,10 +386,10 @@ fdesc_setattr(struct vop_setattr_args *ap) /* * Allow setattr where there is an underlying vnode. 
*/ - error = getvnode(p->p_fd, fd, &fp); + error = holdvnode(p->p_fd, fd, &fp); if (error) { /* - * getvnode() returns EINVAL if the file descriptor is not + * holdvnode() returns EINVAL if the file descriptor is not * backed by a vnode. Silently drop all changes except * chflags(2) in this case. */ @@ -399,7 +399,8 @@ fdesc_setattr(struct vop_setattr_args *ap) else error = 0; } - return (error); + } else { + fdrop(fp); } return (error); } -- 2.41.0