From 359499301cb3e32b532c88c41e7df5a36111392a Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Fri, 20 Apr 2018 08:44:32 -0700 Subject: [PATCH] kernel - per-thread fd cache, p_fd lock bypass * Implement a per-thread (fd,fp) cache. Cache hits can keep fp's in a held state (avoiding the need to fhold()/fdrop() the ref count), and bypass the p_fd spinlock. This allows the file pointer structure to generally be shared across cpu caches. * Can cache up to four descriptors in each thread, LRU. This is the common case. Highly threaded programs tend to focus work on distinct file descriptors in each thread. * One file descriptor can be cached in up to four threads. This is a significant limitation, though relatively uncommon. On a cache miss the code drops into the normal shared p_fd spinlock lookup. --- sys/dev/disk/iscsi/initiator/iscsi.c | 4 +- sys/dev/disk/xdisk/xdisk.c | 2 +- sys/kern/imgact_elf.c | 5 +- sys/kern/kern_acl.c | 10 +- sys/kern/kern_checkpoint.c | 9 +- sys/kern/kern_descrip.c | 551 +++++++++++++++++++++++---- sys/kern/kern_event.c | 7 +- sys/kern/kern_exit.c | 6 + sys/kern/subr_diskiocom.c | 2 +- sys/kern/sys_generic.c | 24 +- sys/kern/sys_mqueue.c | 2 +- sys/kern/uipc_syscalls.c | 72 ++-- sys/kern/vfs_nlookup.c | 3 +- sys/kern/vfs_syscalls.c | 54 ++- sys/netproto/smb/smb_dev.c | 2 +- sys/sys/filedesc.h | 18 +- sys/sys/thread.h | 18 +- sys/vfs/hammer2/hammer2_ioctl.c | 2 +- sys/vfs/hammer2/hammer2_vfsops.c | 2 +- sys/vfs/nfs/nfs_syscalls.c | 2 +- sys/vm/vm_mmap.c | 4 +- 21 files changed, 601 insertions(+), 198 deletions(-) diff --git a/sys/dev/disk/iscsi/initiator/iscsi.c b/sys/dev/disk/iscsi/initiator/iscsi.c index 367464b6f3..868a6467fe 100644 --- a/sys/dev/disk/iscsi/initiator/iscsi.c +++ b/sys/dev/disk/iscsi/initiator/iscsi.c @@ -410,7 +410,7 @@ i_ping(struct cdev *dev) | low level I/O */ static int -i_setsoc(isc_session_t *sp, int fd, struct thread *td) +i_setsoc(isc_session_t *sp, int fd, thread_t td) { int error = 0; struct file *fp; @@ 
-424,7 +424,7 @@ i_setsoc(isc_session_t *sp, int fd, struct thread *td) debug_called(8); - if ((error = holdsock(td->td_proc->p_fd, fd, &fp)) == 0) { + if ((error = holdsock(td, fd, &fp)) == 0) { sp->soc = fp->f_data; sp->fp = fp; isc_start_receiver(sp); diff --git a/sys/dev/disk/xdisk/xdisk.c b/sys/dev/disk/xdisk/xdisk.c index 164a0fbb20..74b3a075d6 100644 --- a/sys/dev/disk/xdisk/xdisk.c +++ b/sys/dev/disk/xdisk/xdisk.c @@ -295,7 +295,7 @@ xdisk_attach(struct xdisk_attach_ioctl *xaioc) /* * Normalize ioctl params */ - fp = holdfp(curproc->p_fd, xaioc->fd, -1); + fp = holdfp(curthread, xaioc->fd, -1); if (fp == NULL) return EINVAL; xa_printf(1, "xdisk_attach fp=%p\n", fp); diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index 5ece1375e4..d2d3cb9f6c 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -1576,12 +1576,14 @@ elf_putsigs(struct lwp *lp, elf_buf_t target) static int elf_putfiles(struct proc *p, elf_buf_t target, struct file *ckfp) { + thread_t td = curthread; int error = 0; int i; struct ckpt_filehdr *cfh = NULL; struct ckpt_fileinfo *cfi; struct file *fp; struct vnode *vp; + /* * the duplicated loop is gross, but it was the only way * to eliminate uninitialized variable warnings @@ -1594,8 +1596,9 @@ elf_putfiles(struct proc *p, elf_buf_t target, struct file *ckfp) /* * ignore STDIN/STDERR/STDOUT. 
*/ + KKASSERT(td->td_proc == p); for (i = 3; error == 0 && i < p->p_fd->fd_nfiles; i++) { - fp = holdfp(p->p_fd, i, -1); + fp = holdfp(td, i, -1); if (fp == NULL) continue; /* diff --git a/sys/kern/kern_acl.c b/sys/kern/kern_acl.c index 2794b15119..4a27388fab 100644 --- a/sys/kern/kern_acl.c +++ b/sys/kern/kern_acl.c @@ -200,8 +200,7 @@ sys___acl_get_fd(struct __acl_get_fd_args *uap) struct file *fp; int error; - KKASSERT(td->td_proc); - if ((error = holdvnode(td->td_proc->p_fd, uap->filedes, &fp)) != 0) + if ((error = holdvnode(td, uap->filedes, &fp)) != 0) return(error); error = vacl_get_acl((struct vnode *)fp->f_data, uap->type, uap->aclp); fdrop(fp); @@ -219,8 +218,7 @@ sys___acl_set_fd(struct __acl_set_fd_args *uap) struct file *fp; int error; - KKASSERT(td->td_proc); - if ((error = holdvnode(td->td_proc->p_fd, uap->filedes, &fp)) != 0) + if ((error = holdvnode(td, uap->filedes, &fp)) != 0) return(error); error = vacl_set_acl((struct vnode *)fp->f_data, uap->type, uap->aclp); fdrop(fp); @@ -263,7 +261,7 @@ sys___acl_delete_fd(struct __acl_delete_fd_args *uap) int error; KKASSERT(td->td_proc); - if ((error = holdvnode(td->td_proc->p_fd, uap->filedes, &fp)) != 0) + if ((error = holdvnode(td, uap->filedes, &fp)) != 0) return(error); error = vacl_delete((struct vnode *)fp->f_data, uap->type); fdrop(fp); @@ -306,7 +304,7 @@ sys___acl_aclcheck_fd(struct __acl_aclcheck_fd_args *uap) int error; KKASSERT(td->td_proc); - if ((error = holdvnode(td->td_proc->p_fd, uap->filedes, &fp)) != 0) + if ((error = holdvnode(td, uap->filedes, &fp)) != 0) return(error); error = vacl_aclcheck((struct vnode *)fp->f_data, uap->type, uap->aclp); fdrop(fp); diff --git a/sys/kern/kern_checkpoint.c b/sys/kern/kern_checkpoint.c index 98abd500b4..fac67ab1c2 100644 --- a/sys/kern/kern_checkpoint.c +++ b/sys/kern/kern_checkpoint.c @@ -719,7 +719,6 @@ sys_sys_checkpoint(struct sys_checkpoint_args *uap) int error = 0; struct thread *td = curthread; struct proc *p = td->td_proc; - struct filedesc 
*fdp = p->p_fd; struct file *fp; /* @@ -742,25 +741,25 @@ sys_sys_checkpoint(struct sys_checkpoint_args *uap) fp = NULL; if (uap->fd == -1 && uap->pid == (pid_t)-1) error = checkpoint_signal_handler(td->td_lwp); - else if ((fp = holdfp(fdp, uap->fd, FWRITE)) == NULL) + else if ((fp = holdfp(td, uap->fd, FWRITE)) == NULL) error = EBADF; else error = ckpt_freeze_proc(td->td_lwp, fp); if (fp) - fdrop(fp); + dropfp(td, uap->fd, fp); break; case CKPT_THAW: if (uap->pid != -1) { error = EINVAL; break; } - if ((fp = holdfp(fdp, uap->fd, FREAD)) == NULL) { + if ((fp = holdfp(td, uap->fd, FREAD)) == NULL) { error = EBADF; break; } uap->sysmsg_result = uap->retval; error = ckpt_thaw_proc(td->td_lwp, fp); - fdrop(fp); + dropfp(td, uap->fd, fp); break; default: error = EOPNOTSUPP; diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 95ab3316cf..26a1ef7135 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -168,6 +168,394 @@ fdfixup_locked(struct filedesc *fdp, int fd) } } +/* + * Clear the fd thread caches for this fdnode. + * + * If match_fdc is NULL, all thread caches of fdn will be cleared. + * The caller must hold fdp->fd_spin exclusively. The threads caching + * the descriptor do not have to be the current thread. The (status) + * argument is ignored. + * + * If match_fdc is not NULL, only the match_fdc's cache will be cleared. + * The caller must hold fdp->fd_spin shared and match_fdc must match a + * fdcache entry in curthread. match_fdc has been locked by the caller + * and had the specified (status). + * + * Since we are matching against a fp in the fdp (which must still be present + * at this time), fp will have at least two refs on any match and we can + * decrement the count trivially. 
+ */ +static +void +fclearcache(struct fdnode *fdn, struct fdcache *match_fdc, int status) +{ + struct fdcache *fdc; + struct file *fp; + int i; + + /* + * match_fdc == NULL We are cleaning out all tdcache entries + * for the fdn and hold fdp->fd_spin exclusively. + * This can race against the target threads + * cleaning out specific entries. + * + * match_fdc != NULL We are cleaning out a specific tdcache + * entry on behalf of the owning thread + * and hold fdp->fd_spin shared. The thread + * has already locked the entry. This cannot + * race. + */ + fp = fdn->fp; + for (i = 0; i < NTDCACHEFD; ++i) { + if ((fdc = fdn->tdcache[i]) == NULL) + continue; + + /* + * If match_fdc is non-NULL we are being asked to + * clear a specific fdc owned by curthread. There must + * be exactly one match. The caller has already locked + * the cache entry and will dispose of the lock after + * we return. + * + * Since we also have a shared lock on fdp, we + * can do this without atomic ops. + */ + if (match_fdc) { + if (fdc != match_fdc) + continue; + fdn->tdcache[i] = NULL; + KASSERT(fp == fdc->fp, + ("fclearcache(1): fp mismatch %p/%p\n", + fp, fdc->fp)); + fdc->fp = NULL; + fdc->fd = -1; + + /* + * status can be 0 or 2. If 2 the ref is borrowed, + * if 0 the ref is not borrowed and we have to drop + * it. + */ + if (status == 0) + atomic_add_int(&fp->f_count, -1); + fdn->isfull = 0; /* heuristic */ + return; + } + + /* + * Otherwise we hold an exclusive spin-lock and can only + * race thread consumers borrowing cache entries. + * + * Acquire the lock and dispose of the entry. We have to + * spin until we get the lock. 
+ */ + for (;;) { + status = atomic_swap_int(&fdc->locked, 1); + if (status == 1) { /* foreign lock, retry */ + cpu_pause(); + continue; + } + fdn->tdcache[i] = NULL; + KASSERT(fp == fdc->fp, + ("fclearcache(2): fp mismatch %p/%p\n", + fp, fdc->fp)); + fdc->fp = NULL; + fdc->fd = -1; + if (status == 0) + atomic_add_int(&fp->f_count, -1); + fdn->isfull = 0; /* heuristic */ + atomic_swap_int(&fdc->locked, 0); + break; + } + } + KKASSERT(match_fdc == NULL); +} + +/* + * Retrieve the fp for the specified fd given the specified file descriptor + * table. The fdp does not have to be owned by the current process. + * If flag != -1, fp->f_flag must contain at least one of the flag bits. + * + * This function is not able to cache the fp. + */ +struct file * +holdfp_fdp(struct filedesc *fdp, int fd, int flag) +{ + struct file *fp; + + spin_lock_shared(&fdp->fd_spin); + if (((u_int)fd) < fdp->fd_nfiles) { + fp = fdp->fd_files[fd].fp; /* can be NULL */ + if (fp) { + if ((fp->f_flag & flag) == 0 && flag != -1) { + fp = NULL; + } else { + fhold(fp); + } + } + } else { + fp = NULL; + } + spin_unlock_shared(&fdp->fd_spin); + + return fp; +} + +/* + * Acquire the fp for the specified file descriptor, using the thread + * cache if possible and caching it if possible. + * + * td must be the current thread. + */ +static +struct file * +_holdfp_cache(thread_t td, int fd) +{ + struct filedesc *fdp; + struct fdcache *fdc; + struct fdcache *best; + struct fdnode *fdn; + struct file *fp; + int status; + int delta; + int i; + + /* + * Fast + */ + for (fdc = &td->td_fdcache[0]; fdc < &td->td_fdcache[NFDCACHE]; ++fdc) { + if (fdc->fd != fd || fdc->fp == NULL) + continue; + status = atomic_swap_int(&fdc->locked, 1); + + /* + * If someone else has locked our cache entry they are in + * the middle of clearing it, skip the entry. + */ + if (status == 1) + continue; + + /* + * We have locked the entry, but if it no longer matches + * restore the previous state (0 or 2) and skip the entry. 
+ */ + if (fdc->fd != fd || fdc->fp == NULL) { + atomic_swap_int(&fdc->locked, status); + continue; + } + + /* + * We have locked a valid entry. We can borrow the ref + * for a mode 0 entry. We can get a valid fp for a mode + * 2 entry but not borrow the ref. + */ + if (status == 0) { + fp = fdc->fp; + fdc->lru = ++td->td_fdcache_lru; + atomic_swap_int(&fdc->locked, 2); + + return fp; + } + if (status == 2) { + fp = fdc->fp; + fhold(fp); + fdc->lru = ++td->td_fdcache_lru; + atomic_swap_int(&fdc->locked, 2); + + return fp; + } + KKASSERT(0); + } + + /* + * Lookup the descriptor the slow way. This can contend against + * modifying operations in a multi-threaded environment and cause + * cache line ping ponging otherwise. + */ + fdp = td->td_proc->p_fd; + spin_lock_shared(&fdp->fd_spin); + + if (((u_int)fd) < fdp->fd_nfiles) { + fp = fdp->fd_files[fd].fp; /* can be NULL */ + if (fp) { + fhold(fp); + if (fdp->fd_files[fd].isfull == 0) + goto enter; + } + } else { + fp = NULL; + } + spin_unlock_shared(&fdp->fd_spin); + + return fp; + + /* + * We found a valid fp and held it, fdp is still shared locked. + * Enter the fp into the per-thread cache. Find the oldest entry + * via lru, or an empty entry. + * + * Because fdp's spinlock is held (shared is fine), no other + * thread should be in the middle of clearing our selected entry. + */ +enter: + best = &td->td_fdcache[0]; + for (fdc = &td->td_fdcache[0]; fdc < &td->td_fdcache[NFDCACHE]; ++fdc) { + if (fdc->fp == NULL) { + best = fdc; + break; + } + delta = fdc->lru - best->lru; + if (delta < 0) + best = fdc; + } + + /* + * Replace best + * + * Don't enter into the cache if we cannot get the lock. + */ + status = atomic_swap_int(&best->locked, 1); + if (status == 1) + goto done; + + /* + * Clear the previous cache entry if present + */ + if (best->fp) { + KKASSERT(best->fd >= 0); + fclearcache(&fdp->fd_files[best->fd], best, status); + } + + /* + * Create our new cache entry. 
This entry is 'safe' until we tie + * into the fdnode. If we cannot tie in, we will clear the entry. + */ + best->fd = fd; + best->fp = fp; + best->lru = ++td->td_fdcache_lru; + best->locked = 2; /* borrowed ref */ + + fdn = &fdp->fd_files[fd]; + for (i = 0; i < NTDCACHEFD; ++i) { + if (fdn->tdcache[i] == NULL && + atomic_cmpset_ptr((void **)&fdn->tdcache[i], NULL, best)) { + goto done; + } + } + fdn->isfull = 1; /* no space */ + best->fd = -1; + best->fp = NULL; + best->locked = 0; +done: + spin_unlock_shared(&fdp->fd_spin); + + return fp; +} + +/* + * Drop the file pointer and return to the thread cache if possible. + * + * Caller must not hold fdp's spin lock. + * td must be the current thread. + */ +void +dropfp(thread_t td, int fd, struct file *fp) +{ + struct filedesc *fdp; + struct fdcache *fdc; + int status; + + fdp = td->td_proc->p_fd; + + /* + * If our placeholder is still present we can re-cache the ref. + * + * Note that we can race an fclearcache(). + */ + for (fdc = &td->td_fdcache[0]; fdc < &td->td_fdcache[NFDCACHE]; ++fdc) { + if (fdc->fp != fp || fdc->fd != fd) + continue; + status = atomic_swap_int(&fdc->locked, 1); + switch(status) { + case 0: + /* + * Not in mode 2, fdrop fp without caching. + */ + atomic_swap_int(&fdc->locked, 0); + break; + case 1: + /* + * Not in mode 2, locked by someone else. + * fdrop fp without caching. + */ + break; + case 2: + /* + * Intact borrowed ref, return to mode 0 + * indicating that we have returned the ref. + * + * Return the borrowed ref (2->1->0) + */ + if (fdc->fp == fp && fdc->fd == fd) { + atomic_swap_int(&fdc->locked, 0); + return; + } + atomic_swap_int(&fdc->locked, 2); + break; + } + } + + /* + * Failed to re-cache, drop the fp without caching. + */ + fdrop(fp); +} + +/* + * Clear all descriptors cached in the per-thread fd cache for + * the specified thread. + * + * Caller must not hold p_fd->spin. This function will temporarily + * obtain a shared spin lock. 
+ */ +void +fexitcache(thread_t td) +{ + struct filedesc *fdp; + struct fdcache *fdc; + int status; + int i; + + if (td->td_proc == NULL) + return; + fdp = td->td_proc->p_fd; + if (fdp == NULL) + return; + + /* + * A shared lock is sufficient as the caller controls td and we + * are only clearing td's cache. + */ + spin_lock_shared(&fdp->fd_spin); + for (i = 0; i < NFDCACHE; ++i) { + fdc = &td->td_fdcache[i]; + if (fdc->fp) { + status = atomic_swap_int(&fdc->locked, 1); + if (status == 1) { + cpu_pause(); + --i; + continue; + } + if (fdc->fp) { + KKASSERT(fdc->fd >= 0); + fclearcache(&fdp->fd_files[fdc->fd], fdc, + status); + } + atomic_swap_int(&fdc->locked, 0); + } + } + spin_unlock_shared(&fdp->fd_spin); +} + static __inline struct filelist_head * fp2filelist(const struct file *fp) { @@ -314,7 +702,7 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred) /* * Operations on file pointers */ - if ((fp = holdfp(p->p_fd, fd, -1)) == NULL) + if ((fp = holdfp(td, fd, -1)) == NULL) return (EBADF); switch (cmd) { @@ -653,6 +1041,7 @@ retry: * old descriptor. delfp inherits the ref from the * descriptor table. 
*/ + fclearcache(&fdp->fd_files[new], NULL, 0); delfp = fdp->fd_files[new].fp; fdp->fd_files[new].fp = NULL; fdp->fd_files[new].reserved = 1; @@ -959,6 +1348,9 @@ kern_close(int fd) KKASSERT(p); fdp = p->p_fd; + /* + * funsetfd*() also clears the fd cache + */ spin_lock(&fdp->fd_spin); if ((fp = funsetfd_locked(fdp, fd)) == NULL) { spin_unlock(&fdp->fd_spin); @@ -1004,13 +1396,10 @@ int kern_shutdown(int fd, int how) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; int error; - KKASSERT(p); - - if ((fp = holdfp(p->p_fd, fd, -1)) == NULL) + if ((fp = holdfp(td, fd, -1)) == NULL) return (EBADF); error = fo_shutdown(fp, how); fdrop(fp); @@ -1038,13 +1427,10 @@ int kern_fstat(int fd, struct stat *ub) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; int error; - KKASSERT(p); - - if ((fp = holdfp(p->p_fd, fd, -1)) == NULL) + if ((fp = holdfp(td, fd, -1)) == NULL) return (EBADF); error = fo_stat(fp, ub, td->td_ucred); fdrop(fp); @@ -1077,12 +1463,11 @@ int sys_fpathconf(struct fpathconf_args *uap) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; struct vnode *vp; int error = 0; - if ((fp = holdfp(p->p_fd, uap->fd, -1)) == NULL) + if ((fp = holdfp(td, uap->fd, -1)) == NULL) return (EBADF); switch (fp->f_type) { @@ -1108,10 +1493,6 @@ sys_fpathconf(struct fpathconf_args *uap) return(error); } -static int fdexpand; -SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, - "Number of times a file table has been expanded"); - /* * Grow the file table so it can hold through descriptor (want). * @@ -1162,7 +1543,6 @@ fdgrow_locked(struct filedesc *fdp, int want) kfree(oldfiles, M_FILEDESC); spin_lock(&fdp->fd_spin); } - fdexpand++; } /* @@ -1512,13 +1892,15 @@ fdrevoke_proc_callback(struct proc *p, void *vinfo) spin_unlock(&p->p_spin); /* - * Locate and close any matching file descriptors. 
+ * Locate and close any matching file descriptors, replacing + * them with info->nfp. */ spin_lock(&fdp->fd_spin); for (n = 0; n < fdp->fd_nfiles; ++n) { if ((fp = fdp->fd_files[n].fp) == NULL) continue; if (fp->f_flag & FREVOKED) { + fclearcache(&fdp->fd_files[n], NULL, 0); fhold(info->nfp); fdp->fd_files[n].fp = info->nfp; spin_unlock(&fdp->fd_spin); @@ -1628,10 +2010,8 @@ checkfdclosed(struct filedesc *fdp, int fd, struct file *fp) * This function always succeeds. * * If fp is NULL, the file descriptor is returned to the pool. - */ - -/* - * (exclusive spinlock must be held on call) + * + * Caller must hold an exclusive spinlock on fdp->fd_spin. */ static void fsetfd_locked(struct filedesc *fdp, struct file *fp, int fd) @@ -1640,6 +2020,7 @@ fsetfd_locked(struct filedesc *fdp, struct file *fp, int fd) KKASSERT(fdp->fd_files[fd].reserved != 0); if (fp) { fhold(fp); + fclearcache(&fdp->fd_files[fd], NULL, 0); fdp->fd_files[fd].fp = fp; fdp->fd_files[fd].reserved = 0; } else { @@ -1649,6 +2030,9 @@ fsetfd_locked(struct filedesc *fdp, struct file *fp, int fd) } } +/* + * Caller must hold an exclusive spinlock on fdp->fd_spin. + */ void fsetfd(struct filedesc *fdp, struct file *fp, int fd) { @@ -1658,7 +2042,7 @@ fsetfd(struct filedesc *fdp, struct file *fp, int fd) } /* - * (exclusive spinlock must be held on call) + * Caller must hold an exclusive spinlock on fdp->fd_spin. */ static struct file * @@ -1670,11 +2054,13 @@ funsetfd_locked(struct filedesc *fdp, int fd) return (NULL); if ((fp = fdp->fd_files[fd].fp) == NULL) return (NULL); + fclearcache(&fdp->fd_files[fd], NULL, 0); fdp->fd_files[fd].fp = NULL; fdp->fd_files[fd].fileflags = 0; fdreserve_locked(fdp, fd, -1); fdfixup_locked(fdp, fd); + return(fp); } @@ -1960,6 +2346,9 @@ again: * copied files yet we can ignore the return value from funsetfd(). * * The read spinlock on fdp is still being held. + * + * Be sure to clean out fdnode->tdcache, otherwise bad things will + * happen. 
*/ bcopy(fdp->fd_files, newfdp->fd_files, i * sizeof(struct fdnode)); for (i = 0 ; i < newfdp->fd_nfiles; ++i) { @@ -1969,6 +2358,7 @@ again: fdnode->reserved = 0; fdfixup_locked(newfdp, i); } else if (fdnode->fp) { + bzero(&fdnode->tdcache, sizeof(fdnode->tdcache)); if (fdnode->fp->f_type == DTYPE_KQUEUE) { (void)funsetfd_locked(newfdp, i); } else { @@ -1997,6 +2387,13 @@ fdfree(struct proc *p, struct filedesc *repl) struct vnode *vp; struct flock lf; + /* + * Before destroying or replacing p->p_fd we must be sure to + * clean out the cache of the last thread, which should be + * curthread. + */ + fexitcache(curthread); + /* * Certain daemons might not have file descriptors. */ @@ -2152,91 +2549,82 @@ fdfree(struct proc *p, struct filedesc *repl) /* * Retrieve and reference the file pointer associated with a descriptor. + * + * td must be the current thread. */ struct file * -holdfp(struct filedesc *fdp, int fd, int flag) +holdfp(thread_t td, int fd, int flag) { - struct file* fp; + struct file *fp; - spin_lock_shared(&fdp->fd_spin); - if (((u_int)fd) >= fdp->fd_nfiles) { - fp = NULL; - goto done; - } - if ((fp = fdp->fd_files[fd].fp) == NULL) - goto done; - if ((fp->f_flag & flag) == 0 && flag != -1) { - fp = NULL; - goto done; + fp = _holdfp_cache(td, fd); + if (fp) { + if ((fp->f_flag & flag) == 0 && flag != -1) { + fdrop(fp); + fp = NULL; + } } - fhold(fp); -done: - spin_unlock_shared(&fdp->fd_spin); - return (fp); + return fp; } /* * holdsock() - load the struct file pointer associated * with a socket into *fpp. If an error occurs, non-zero * will be returned and *fpp will be set to NULL. + * + * td must be the current thread. 
*/ int -holdsock(struct filedesc *fdp, int fd, struct file **fpp) +holdsock(thread_t td, int fd, struct file **fpp) { struct file *fp; int error; - spin_lock_shared(&fdp->fd_spin); - if ((unsigned)fd >= fdp->fd_nfiles) { - error = EBADF; - fp = NULL; - goto done; - } - if ((fp = fdp->fd_files[fd].fp) == NULL) { + /* + * Lockless shortcut + */ + fp = _holdfp_cache(td, fd); + if (fp) { + if (fp->f_type != DTYPE_SOCKET) { + fdrop(fp); + fp = NULL; + error = ENOTSOCK; + } else { + error = 0; + } + } else { error = EBADF; - goto done; } - if (fp->f_type != DTYPE_SOCKET) { - error = ENOTSOCK; - goto done; - } - fhold(fp); - error = 0; -done: - spin_unlock_shared(&fdp->fd_spin); *fpp = fp; + return (error); } /* * Convert a user file descriptor to a held file pointer. + * + * td must be the current thread. */ int -holdvnode(struct filedesc *fdp, int fd, struct file **fpp) +holdvnode(thread_t td, int fd, struct file **fpp) { struct file *fp; int error; - spin_lock_shared(&fdp->fd_spin); - if ((unsigned)fd >= fdp->fd_nfiles) { - error = EBADF; - fp = NULL; - goto done; - } - if ((fp = fdp->fd_files[fd].fp) == NULL) { + fp = _holdfp_cache(td, fd); + if (fp) { + if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { + fdrop(fp); + fp = NULL; + error = EINVAL; + } else { + error = 0; + } + } else { error = EBADF; - goto done; - } - if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { - fp = NULL; - error = EINVAL; - goto done; } - fhold(fp); - error = 0; -done: - spin_unlock_shared(&fdp->fd_spin); *fpp = fp; + return (error); } @@ -2299,7 +2687,9 @@ setugidsafety(struct proc *p) } /* - * Close any files on exec? + * Close all CLOEXEC files on exec. + * + * Only a single thread remains for the current process. * * NOT MPSAFE - scans fdp without spinlocks, calls knote_fdclose() */ @@ -2325,6 +2715,8 @@ fdcloseexec(struct proc *p) /* * NULL-out descriptor prior to close to avoid * a race while close blocks. 
+ * + * (funsetfd*() also clears the fd cache) */ if ((fp = funsetfd_locked(fdp, i)) != NULL) { knote_fdclose(fp, fdp, i); @@ -2562,13 +2954,13 @@ fdrop(struct file *fp) int sys_flock(struct flock_args *uap) { - struct proc *p = curproc; + thread_t td = curthread; struct file *fp; struct vnode *vp; struct flock lf; int error; - if ((fp = holdfp(p->p_fd, uap->fd, -1)) == NULL) + if ((fp = holdfp(td, uap->fd, -1)) == NULL) return (EBADF); if (fp->f_type != DTYPE_VNODE) { error = EOPNOTSUPP; @@ -2634,13 +3026,14 @@ fdopen(struct dev_open_args *ap) * must fsetfd() it. On failure the caller will clean it up. */ int -dupfdopen(struct filedesc *fdp, int dfd, int sfd, int mode, int error) +dupfdopen(thread_t td, int dfd, int sfd, int mode, int error) { + struct filedesc *fdp; struct file *wfp; struct file *xfp; int werror; - if ((wfp = holdfp(fdp, sfd, -1)) == NULL) + if ((wfp = holdfp(td, sfd, -1)) == NULL) return (EBADF); /* @@ -2656,6 +3049,8 @@ dupfdopen(struct filedesc *fdp, int dfd, int sfd, int mode, int error) return (werror); } + fdp = td->td_proc->p_fd; + /* * There are two cases of interest here. 
* diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index e776ce55ef..a8752b9888 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -1027,7 +1027,6 @@ int sys_kevent(struct kevent_args *uap) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct timespec ts, *tsp; struct kqueue *kq; struct file *fp = NULL; @@ -1042,7 +1041,7 @@ sys_kevent(struct kevent_args *uap) } else { tsp = NULL; } - fp = holdfp(p->p_fd, uap->fd, -1); + fp = holdfp(td, uap->fd, -1); if (fp == NULL) return (EBADF); if (fp->f_type != DTYPE_KQUEUE) { @@ -1059,7 +1058,7 @@ sys_kevent(struct kevent_args *uap) error = kern_kevent(kq, uap->nevents, &uap->sysmsg_result, kap, kevent_copyin, kevent_copyout, tsp, 0); - fdrop(fp); + dropfp(td, uap->fd, fp); return (error); } @@ -1091,7 +1090,7 @@ kqueue_register(struct kqueue *kq, struct kevent *kev) if (fops->f_flags & FILTEROP_ISFD) { /* validate descriptor */ - fp = holdfp(fdp, kev->ident, -1); + fp = holdfp_fdp(fdp, kev->ident, -1); if (fp == NULL) return (EBADF); } diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index baa1fcb99b..c9928242f0 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -698,6 +698,12 @@ lwp_exit(int masterexit, void *waddr) plimit_free(rlimit); } + /* + * Cleanup any cached descriptors for this thread + */ + if (p->p_fd) + fexitcache(td); + /* * Nobody actually wakes us when the lock * count reaches zero, so just wait one tick. 
diff --git a/sys/kern/subr_diskiocom.c b/sys/kern/subr_diskiocom.c index d6039365bc..b67161af35 100644 --- a/sys/kern/subr_diskiocom.c +++ b/sys/kern/subr_diskiocom.c @@ -116,7 +116,7 @@ disk_iocom_ioctl(struct disk *dp, u_long cmd, void *data) switch(cmd) { case DIOCRECLUSTER: recl = data; - fp = holdfp(curproc->p_fd, recl->fd, -1); + fp = holdfp(curthread, recl->fd, -1); if (fp) { error = disk_iocom_reconnect(dp, fp); } else { diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 2af09a62b2..b2f6258d5e 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -249,13 +249,10 @@ int kern_preadv(int fd, struct uio *auio, int flags, size_t *res) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; int error; - KKASSERT(p); - - fp = holdfp(p->p_fd, fd, FREAD); + fp = holdfp(td, fd, FREAD); if (fp == NULL) return (EBADF); if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) { @@ -263,7 +260,8 @@ kern_preadv(int fd, struct uio *auio, int flags, size_t *res) } else { error = dofileread(fd, fp, auio, flags, res); } - fdrop(fp); + dropfp(td, fd, fp); + return(error); } @@ -455,13 +453,10 @@ int kern_pwritev(int fd, struct uio *auio, int flags, size_t *res) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; int error; - KKASSERT(p); - - fp = holdfp(p->p_fd, fd, FWRITE); + fp = holdfp(td, fd, FWRITE); if (fp == NULL) return (EBADF); else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) { @@ -469,9 +464,9 @@ kern_pwritev(int fd, struct uio *auio, int flags, size_t *res) } else { error = dofilewrite(fd, fp, auio, flags, res); } - - fdrop(fp); - return (error); + dropfp(td, fd, fp); + + return(error); } /* @@ -581,7 +576,7 @@ mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map, cred = td->td_ucred; memp = NULL; - fp = holdfp(p->p_fd, fd, FREAD|FWRITE); + fp = holdfp(td, fd, FREAD|FWRITE); if (fp == NULL) return(EBADF); @@ -731,7 +726,8 @@ mapped_ioctl(int fd, u_long 
com, caddr_t uspc_data, struct ioctl_map *map, done: if (memp != NULL) kfree(memp, M_IOCTLOPS); - fdrop(fp); + dropfp(td, fd, fp); + return(error); } diff --git a/sys/kern/sys_mqueue.c b/sys/kern/sys_mqueue.c index 54d18fc4fe..3c44b78f81 100644 --- a/sys/kern/sys_mqueue.c +++ b/sys/kern/sys_mqueue.c @@ -180,7 +180,7 @@ mqueue_get(struct lwp *l, mqd_t mqd, file_t **fpr) struct mqueue *mq; file_t *fp; - fp = holdfp(curproc->p_fd, (int)mqd, -1); /* XXX: Why -1 ? */ + fp = holdfp(curthread, (int)mqd, -1); /* XXX: Why -1 ? */ if (__predict_false(fp == NULL)) return EBADF; diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 5f51a8fb99..5d30492ba9 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -149,16 +149,15 @@ int kern_bind(int s, struct sockaddr *sa) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; int error; - KKASSERT(p); - error = holdsock(p->p_fd, s, &fp); + error = holdsock(td, s, &fp); if (error) return (error); error = sobind((struct socket *)fp->f_data, sa, td); - fdrop(fp); + dropfp(td, s, fp); + return (error); } @@ -186,17 +185,16 @@ int kern_listen(int s, int backlog) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; int error; - KKASSERT(p); - error = holdsock(p->p_fd, s, &fp); + error = holdsock(td, s, &fp); if (error) return (error); error = solisten((struct socket *)fp->f_data, backlog, td); - fdrop(fp); - return(error); + dropfp(td, s, fp); + + return (error); } /* @@ -290,7 +288,7 @@ kern_accept(int s, int fflags, struct sockaddr **name, int *namelen, int *res, if (name && namelen && *namelen < 0) return (EINVAL); - error = holdsock(td->td_proc->p_fd, s, &lfp); + error = holdsock(td, s, &lfp); if (error) return (error); @@ -417,7 +415,8 @@ done: fsetfd(fdp, nfp, fd); } fdrop(nfp); - fdrop(lfp); + dropfp(td, s, lfp); + return (error); } @@ -553,12 +552,11 @@ int kern_connect(int s, int fflags, struct sockaddr *sa) { struct thread *td = 
curthread; - struct proc *p = td->td_proc; struct file *fp; struct socket *so; int error, interrupted = 0; - error = holdsock(p->p_fd, s, &fp); + error = holdsock(td, s, &fp); if (error) return (error); so = (struct socket *)fp->f_data; @@ -605,7 +603,8 @@ bad: if (error == ERESTART) error = EINTR; done: - fdrop(fp); + dropfp(td, s, fp); + return (error); } @@ -761,7 +760,7 @@ kern_sendmsg(int s, struct sockaddr *sa, struct uio *auio, struct uio ktruio; #endif - error = holdsock(p->p_fd, s, &fp); + error = holdsock(td, s, &fp); if (error) return (error); #ifdef KTRACE @@ -800,7 +799,8 @@ kern_sendmsg(int s, struct sockaddr *sa, struct uio *auio, #endif if (error == 0) *res = len - auio->uio_resid; - fdrop(fp); + dropfp(td, s, fp); + return (error); } @@ -928,7 +928,6 @@ kern_recvmsg(int s, struct sockaddr **sa, struct uio *auio, struct mbuf **control, int *flags, size_t *res) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; size_t len; int error; @@ -939,7 +938,7 @@ kern_recvmsg(int s, struct sockaddr **sa, struct uio *auio, struct uio ktruio; #endif - error = holdsock(p->p_fd, s, &fp); + error = holdsock(td, s, &fp); if (error) return (error); #ifdef KTRACE @@ -983,7 +982,8 @@ kern_recvmsg(int s, struct sockaddr **sa, struct uio *auio, #endif if (error == 0) *res = len - auio->uio_resid; - fdrop(fp); + dropfp(td, s, fp); + return (error); } @@ -1169,7 +1169,6 @@ int kern_setsockopt(int s, struct sockopt *sopt) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; int error; @@ -1180,12 +1179,13 @@ kern_setsockopt(int s, struct sockopt *sopt) if (sopt->sopt_valsize > SOMAXOPT_SIZE) /* unsigned */ return (EINVAL); - error = holdsock(p->p_fd, s, &fp); + error = holdsock(td, s, &fp); if (error) return (error); error = sosetopt((struct socket *)fp->f_data, sopt); - fdrop(fp); + dropfp(td, s, fp); + return (error); } @@ -1232,7 +1232,6 @@ int kern_getsockopt(int s, struct sockopt *sopt) { struct thread *td = 
curthread; - struct proc *p = td->td_proc; struct file *fp; int error; @@ -1241,12 +1240,13 @@ kern_getsockopt(int s, struct sockopt *sopt) if (sopt->sopt_val != NULL && sopt->sopt_valsize == 0) return (EINVAL); - error = holdsock(p->p_fd, s, &fp); + error = holdsock(td, s, &fp); if (error) return (error); error = sogetopt((struct socket *)fp->f_data, sopt); - fdrop(fp); + dropfp(td, s, fp); + return (error); } @@ -1319,13 +1319,12 @@ int kern_getsockname(int s, struct sockaddr **name, int *namelen) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; struct socket *so; struct sockaddr *sa = NULL; int error; - error = holdsock(p->p_fd, s, &fp); + error = holdsock(td, s, &fp); if (error) return (error); if (*namelen < 0) { @@ -1342,8 +1341,8 @@ kern_getsockname(int s, struct sockaddr **name, int *namelen) *name = sa; } } + dropfp(td, s, fp); - fdrop(fp); return (error); } @@ -1385,13 +1384,12 @@ int kern_getpeername(int s, struct sockaddr **name, int *namelen) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; struct socket *so; struct sockaddr *sa = NULL; int error; - error = holdsock(p->p_fd, s, &fp); + error = holdsock(td, s, &fp); if (error) return (error); if (*namelen < 0) { @@ -1412,8 +1410,8 @@ kern_getpeername(int s, struct sockaddr **name, int *namelen) *name = sa; } } + dropfp(td, s, fp); - fdrop(fp); return (error); } @@ -1526,7 +1524,6 @@ int sys_sendfile(struct sendfile_args *uap) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; struct vnode *vp = NULL; struct sf_hdtr hdtr; @@ -1539,13 +1536,11 @@ sys_sendfile(struct sendfile_args *uap) off_t sbytes; int error; - KKASSERT(p); - /* * Do argument checking. Must be a regular file in, stream * type and connected socket out, positive offset. 
*/ - fp = holdfp(p->p_fd, uap->fd, FREAD); + fp = holdfp(td, uap->fd, FREAD); if (fp == NULL) { return (EBADF); } @@ -1555,7 +1550,7 @@ sys_sendfile(struct sendfile_args *uap) } vp = (struct vnode *)fp->f_data; vref(vp); - fdrop(fp); + dropfp(td, uap->fd, fp); /* * If specified, get the pointer to the sf_hdtr struct for @@ -1633,7 +1628,6 @@ kern_sendfile(struct vnode *vp, int sfd, off_t offset, size_t nbytes, struct mbuf *mheader, off_t *sbytes, int flags) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct vm_object *obj; struct socket *so; struct file *fp; @@ -1652,7 +1646,7 @@ kern_sendfile(struct vnode *vp, int sfd, off_t offset, size_t nbytes, error = EINVAL; goto done0; } - error = holdsock(p->p_fd, sfd, &fp); + error = holdsock(td, sfd, &fp); if (error) goto done0; so = (struct socket *)fp->f_data; @@ -1921,7 +1915,7 @@ done: vm_object_drop(obj); ssb_unlock(&so->so_snd); done1: - fdrop(fp); + dropfp(td, sfd, fp); done0: if (mheader != NULL) m_freem(mheader); diff --git a/sys/kern/vfs_nlookup.c b/sys/kern/vfs_nlookup.c index 7645430d2f..c3c6fd56b6 100644 --- a/sys/kern/vfs_nlookup.c +++ b/sys/kern/vfs_nlookup.c @@ -152,7 +152,6 @@ nlookup_init_at(struct nlookupdata *nd, struct file **fpp, int fd, const char *path, enum uio_seg seg, int flags) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file* fp; struct vnode *vp; int error; @@ -164,7 +163,7 @@ nlookup_init_at(struct nlookupdata *nd, struct file **fpp, int fd, } if (nd->nl_path[0] != '/' && fd != AT_FDCWD) { - if ((error = holdvnode(p->p_fd, fd, &fp)) != 0) + if ((error = holdvnode(td, fd, &fp)) != 0) goto done; vp = (struct vnode*)fp->f_data; if (vp->v_type != VDIR || fp->f_nchandle.ncp == NULL) { diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 4024592a1b..f5ab24a19e 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -80,7 +80,7 @@ static void mount_warning(struct mount *mp, const char *ctl, ...) 
static int mount_path(struct proc *p, struct mount *mp, char **rb, char **fb); static int checkvp_chdir (struct vnode *vn, struct thread *td); static void checkdirs (struct nchandle *old_nch, struct nchandle *new_nch); -static int chroot_refuse_vdir_fds (struct filedesc *fdp); +static int chroot_refuse_vdir_fds (thread_t td, struct filedesc *fdp); static int chroot_visible_mnt(struct mount *mp, struct proc *p); static int getutimes (struct timeval *, struct timespec *); static int getutimens (const struct timespec *, struct timespec *, int *); @@ -1100,7 +1100,6 @@ int sys_mountctl(struct mountctl_args *uap) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; void *ctl = NULL; void *buf = NULL; @@ -1110,7 +1109,6 @@ sys_mountctl(struct mountctl_args *uap) /* * Sanity and permissions checks. We must be root. */ - KKASSERT(p); if (td->td_ucred->cr_prison != NULL) return (EPERM); if ((uap->op != MOUNTCTL_MOUNTFLAGS) && @@ -1148,7 +1146,7 @@ sys_mountctl(struct mountctl_args *uap) * Validate the descriptor */ if (uap->fd >= 0) { - fp = holdfp(p->p_fd, uap->fd, -1); + fp = holdfp(td, uap->fd, -1); if (fp == NULL) { error = EBADF; goto done; @@ -1160,9 +1158,10 @@ sys_mountctl(struct mountctl_args *uap) /* * Execute the internal kernel function and clean up. 
*/ - error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen, buf, uap->buflen, &uap->sysmsg_result); + error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen, + buf, uap->buflen, &uap->sysmsg_result); if (fp) - fdrop(fp); + dropfp(td, uap->fd, fp); if (error == 0 && uap->sysmsg_result > 0) error = copyout(buf, uap->buf, uap->sysmsg_result); done: @@ -1310,7 +1309,7 @@ kern_fstatfs(int fd, struct statfs *buf) int error; KKASSERT(p); - if ((error = holdvnode(p->p_fd, fd, &fp)) != 0) + if ((error = holdvnode(td, fd, &fp)) != 0) return (error); /* @@ -1416,14 +1415,12 @@ int kern_fstatvfs(int fd, struct statvfs *buf) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; struct mount *mp; struct statvfs *sp; int error; - KKASSERT(p); - if ((error = holdvnode(p->p_fd, fd, &fp)) != 0) + if ((error = holdvnode(td, fd, &fp)) != 0) return (error); if ((mp = fp->f_nchandle.mount) == NULL) mp = ((struct vnode *)fp->f_data)->v_mount; @@ -1681,7 +1678,7 @@ sys_fchdir(struct fchdir_args *uap) struct nchandle nch, onch, tnch; int error; - if ((error = holdvnode(fdp, uap->fd, &fp)) != 0) + if ((error = holdvnode(td, uap->fd, &fp)) != 0) return (error); lwkt_gettoken(&p->p_token); vp = (struct vnode *)fp->f_data; @@ -1799,7 +1796,7 @@ sys_chdir(struct chdir_args *uap) * any filedescriptors are open directories. 
*/ static int -chroot_refuse_vdir_fds(struct filedesc *fdp) +chroot_refuse_vdir_fds(thread_t td, struct filedesc *fdp) { struct vnode *vp; struct file *fp; @@ -1807,7 +1804,7 @@ chroot_refuse_vdir_fds(struct filedesc *fdp) int fd; for (fd = 0; fd < fdp->fd_nfiles ; fd++) { - if ((error = holdvnode(fdp, fd, &fp)) != 0) + if ((error = holdvnode(td, fd, &fp)) != 0) continue; vp = (struct vnode *)fp->f_data; if (vp->v_type != VDIR) { @@ -1859,7 +1856,7 @@ kern_chroot(struct nchandle *nch) */ if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { - if ((error = chroot_refuse_vdir_fds(fdp)) != 0) + if ((error = chroot_refuse_vdir_fds(td, fdp)) != 0) return (error); } if ((vp = nch->ncp->nc_vp) == NULL) @@ -2042,7 +2039,7 @@ kern_open(struct nlookupdata *nd, int oflags, int mode, int *res) */ if ((error == ENODEV || error == ENXIO) && lp->lwp_dupfd >= 0) { if (fdalloc(p, 0, &indx) == 0) { - error = dupfdopen(fdp, indx, lp->lwp_dupfd, flags, error); + error = dupfdopen(td, indx, lp->lwp_dupfd, flags, error); if (error == 0) { *res = indx; fdrop(fp); /* our ref */ @@ -2123,6 +2120,7 @@ kern_open(struct nlookupdata *nd, int oflags, int mode, int *res) fsetfd(fdp, fp, indx); fdrop(fp); *res = indx; + return (error); } @@ -2682,14 +2680,13 @@ int kern_lseek(int fd, off_t offset, int whence, off_t *res) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; struct vnode *vp; struct vattr vattr; off_t new_offset; int error; - fp = holdfp(p->p_fd, fd, -1); + fp = holdfp(td, fd, -1); if (fp == NULL) return (EBADF); if (fp->f_type != DTYPE_VNODE) { @@ -2741,7 +2738,8 @@ kern_lseek(int fd, off_t offset, int whence, off_t *res) *res = fp->f_offset; spin_unlock(&fp->f_spin); done: - fdrop(fp); + dropfp(td, fd, fp); + return (error); } @@ -3204,11 +3202,10 @@ int sys_fchflags(struct fchflags_args *uap) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; int error; - if 
((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0) + if ((error = holdvnode(td, uap->fd, &fp)) != 0) return (error); if (fp->f_nchandle.ncp) error = ncp_writechk(&fp->f_nchandle); @@ -3334,11 +3331,10 @@ int sys_fchmod(struct fchmod_args *uap) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct file *fp; int error; - if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0) + if ((error = holdvnode(td, uap->fd, &fp)) != 0) return (error); if (fp->f_nchandle.ncp) error = ncp_writechk(&fp->f_nchandle); @@ -3478,7 +3474,7 @@ sys_fchown(struct fchown_args *uap) struct file *fp; int error; - if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0) + if ((error = holdvnode(td, uap->fd, &fp)) != 0) return (error); if (fp->f_nchandle.ncp) error = ncp_writechk(&fp->f_nchandle); @@ -3661,7 +3657,6 @@ int kern_futimens(int fd, struct timespec *ts) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct timespec newts[2]; struct file *fp; struct vnode *vp; @@ -3672,7 +3667,7 @@ kern_futimens(int fd, struct timespec *ts) error = getutimens(ts, newts, &nullflag); if (error) return (error); - if ((error = holdvnode(p->p_fd, fd, &fp)) != 0) + if ((error = holdvnode(td, fd, &fp)) != 0) return (error); if (fp->f_nchandle.ncp) error = ncp_writechk(&fp->f_nchandle); @@ -3881,7 +3876,6 @@ int kern_ftruncate(int fd, off_t length) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct vattr vattr; struct vnode *vp; struct file *fp; @@ -3893,7 +3887,7 @@ kern_ftruncate(int fd, off_t length) if (length < 0) return(EINVAL); - if ((error = holdvnode(p->p_fd, fd, &fp)) != 0) + if ((error = holdvnode(td, fd, &fp)) != 0) return (error); if (fp->f_nchandle.ncp) { error = ncp_writechk(&fp->f_nchandle); @@ -3961,13 +3955,12 @@ int sys_fsync(struct fsync_args *uap) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct vnode *vp; struct file *fp; vm_object_t obj; int error; - if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0) + if 
((error = holdvnode(td, uap->fd, &fp)) != 0) return (error); vp = (struct vnode *)fp->f_data; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); @@ -4365,7 +4358,6 @@ kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res, enum uio_seg direction) { struct thread *td = curthread; - struct proc *p = td->td_proc; struct vnode *vp; struct file *fp; struct uio auio; @@ -4373,7 +4365,7 @@ kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res, off_t loff; int error, eofflag; - if ((error = holdvnode(p->p_fd, fd, &fp)) != 0) + if ((error = holdvnode(td, fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { error = EBADF; diff --git a/sys/netproto/smb/smb_dev.c b/sys/netproto/smb/smb_dev.c index 044646dea5..7ac5faa9b2 100644 --- a/sys/netproto/smb/smb_dev.c +++ b/sys/netproto/smb/smb_dev.c @@ -388,7 +388,7 @@ smb_dev2share(int fd, int mode, struct smb_cred *scred, KKASSERT(scred->scr_td->td_proc); - fp = holdfp(scred->scr_td->td_proc->p_fd, fd, FREAD|FWRITE); + fp = holdfp_fdp(scred->scr_td->td_proc->p_fd, fd, FREAD|FWRITE); if (fp == NULL) return EBADF; diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h index 0acb811e3c..65d044ad49 100644 --- a/sys/sys/filedesc.h +++ b/sys/sys/filedesc.h @@ -61,17 +61,20 @@ * the resource limit is reached. */ #define NDFILE 15 /* must be of the form 2^n - 1 */ +#define NTDCACHEFD 4 /* max td's caching same fd */ struct file; struct klist; +struct fdcache; struct fdnode { struct file *fp; char fileflags; - char unused01; - char unused02; + char isfull; + char iterator; char reserved; /* descriptor has been reserved */ int allocated; /* subtree allocation count */ + struct fdcache *tdcache[NTDCACHEFD]; }; struct filedesc { @@ -148,7 +151,7 @@ struct lwp; /* * Kernel global variables and routines. 
*/ -int dupfdopen (struct filedesc *, int, int, int, int); +int dupfdopen (struct thread *, int, int, int, int); int fdalloc (struct proc *p, int want, int *result); int fdavail (struct proc *p, int n); int falloc (struct lwp *lp, struct file **resultfp, int *resultfd); @@ -166,9 +169,11 @@ int fdrevoke(void *f_data, short f_type, struct ucred *cred); int closef (struct file *fp, struct proc *p); void fdcloseexec (struct proc *p); int fdcheckstd (struct lwp *lp); -struct file *holdfp (struct filedesc *fdp, int fd, int flag); -int holdsock (struct filedesc *fdp, int fdes, struct file **fpp); -int holdvnode (struct filedesc *fdp, int fd, struct file **fpp); +struct file *holdfp (struct thread *td, int fd, int flag); +struct file *holdfp_fdp (struct filedesc *fdp, int fd, int flag); +int holdsock (struct thread *td, int fdes, struct file **fpp); +int holdvnode (struct thread *td, int fd, struct file **fpp); +void dropfp(struct thread *td, int fd, struct file *fp); int fdissequential (struct file *); void fdsequential (struct file *, int); pid_t fgetown (struct sigio **); @@ -177,6 +182,7 @@ void funsetown (struct sigio **); void funsetownlst (struct sigiolst *); void setugidsafety (struct proc *p); void allfiles_scan_exclusive(int (*callback)(struct file *, void *), void *data); +void fexitcache(struct thread *td); struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, diff --git a/sys/sys/thread.h b/sys/sys/thread.h index 7e6f1c8902..395a690e36 100644 --- a/sys/sys/thread.h +++ b/sys/sys/thread.h @@ -48,6 +48,7 @@ struct lwkt_ipiq; struct lwkt_cpu_msg; struct lwkt_cpu_port; struct lwkt_cpusync; +struct fdnode; union sysunion; typedef struct lwkt_queue *lwkt_queue_t; @@ -216,6 +217,19 @@ typedef struct lwkt_cpu_msg { thread_t cm_originator; /* originating thread for wakeup */ } lwkt_cpu_msg; +/* + * per-thread file descriptor cache + */ +struct fdcache { + int fd; /* descriptor being cached */ + int locked; + struct file *fp; /* cached 
referenced fp */ + int lru; + int unused[3]; +} __cachealign; + +#define NFDCACHE 4 /* max fd's cached by a thread */ + /* * Thread structure. Note that ownership of a thread structure is special * cased and there is no 'token'. A thread is always owned by the cpu @@ -265,7 +279,8 @@ struct thread { int td_upri; /* user priority (sub-priority under td_pri) */ int td_type; /* thread type, TD_TYPE_ */ int td_tracker; /* for callers to debug lock counts */ - int td_unused03[4]; /* for future fields */ + int td_fdcache_lru; + int td_unused03[3]; /* for future fields */ struct iosched_data td_iosdata; /* Dynamic I/O scheduling data */ struct timeval td_start; /* start time for a thread/process */ char td_comm[MAXCOMLEN+1]; /* typ 16+1 bytes */ @@ -278,6 +293,7 @@ struct thread { int td_fairq_load; /* fairq */ int td_fairq_count; /* fairq */ struct globaldata *td_migrate_gd; /* target gd for thread migration */ + struct fdcache td_fdcache[NFDCACHE]; #ifdef DEBUG_CRIT_SECTIONS #define CRIT_DEBUG_ARRAY_SIZE 32 #define CRIT_DEBUG_ARRAY_MASK (CRIT_DEBUG_ARRAY_SIZE - 1) diff --git a/sys/vfs/hammer2/hammer2_ioctl.c b/sys/vfs/hammer2/hammer2_ioctl.c index 07c4caab0f..69a4851c52 100644 --- a/sys/vfs/hammer2/hammer2_ioctl.c +++ b/sys/vfs/hammer2/hammer2_ioctl.c @@ -190,7 +190,7 @@ hammer2_ioctl_recluster(hammer2_inode_t *ip, void *data) hammer2_cluster_t *cluster; int error; - fp = holdfp(curproc->p_fd, recl->fd, -1); + fp = holdfp(curthread, recl->fd, -1); if (fp) { error = VFS_ROOT(ip->pmp->mp, &vproot); if (error == 0) { diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c index afd124654e..1db125a3c9 100644 --- a/sys/vfs/hammer2/hammer2_vfsops.c +++ b/sys/vfs/hammer2/hammer2_vfsops.c @@ -1308,7 +1308,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, * Root mounts typically do not supply one. 
*/ if (info.cluster_fd >= 0) { - fp = holdfp(curproc->p_fd, info.cluster_fd, -1); + fp = holdfp(curthread, info.cluster_fd, -1); if (fp) { hammer2_cluster_reconnect(hmp, fp); } else { diff --git a/sys/vfs/nfs/nfs_syscalls.c b/sys/vfs/nfs/nfs_syscalls.c index 3afddd940b..10626c9a06 100644 --- a/sys/vfs/nfs/nfs_syscalls.c +++ b/sys/vfs/nfs/nfs_syscalls.c @@ -190,7 +190,7 @@ sys_nfssvc(struct nfssvc_args *uap) error = copyin(uap->argp, (caddr_t)&nfsdarg, sizeof(nfsdarg)); if (error) goto done; - error = holdsock(td->td_proc->p_fd, nfsdarg.sock, &fp); + error = holdsock(td, nfsdarg.sock, &fp); if (error) goto done; /* diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index 27530b72b8..c1e20c1c58 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -246,7 +246,7 @@ kern_mmap(struct vmspace *vms, caddr_t uaddr, size_t ulen, * Mapping file, get fp for validation. Obtain vnode and make * sure it is of appropriate type. */ - fp = holdfp(p->p_fd, fd, -1); + fp = holdfp(td, fd, -1); if (fp == NULL) return (EBADF); if (fp->f_type != DTYPE_VNODE) { @@ -397,7 +397,7 @@ kern_mmap(struct vmspace *vms, caddr_t uaddr, size_t ulen, lwkt_reltoken(&vms->vm_map.token); done: if (fp) - fdrop(fp); + dropfp(td, fd, fp); return (error); } -- 2.41.0