From d629916367683a6eed53be1506f361d512724496 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sat, 21 Apr 2018 17:30:42 -0700
Subject: [PATCH] kernel - Remove SMP bottlenecks on uidinfo, descriptors, and lockf

* Use an eventcounter and the per-thread fd cache to fix bottlenecks in
  checkfdclosed().  This will work well for the vast majority of
  applications and test benches.

* Batch holdfp*() operations on kqueue collections when implementing
  poll() and select().  This significantly improves performance.  Full
  scaling is not yet achieved, however.

* Increase copyin item batching from 8 to 32 for select() and poll().

* Give the uidinfo structure a pcpu array to hold the posixlocks and
  openfiles count fields, with a rollup contained in the uidinfo
  structure itself.  This removes numerous global bottlenecks related
  to open(), close(), dup*(), and lockf operations (posixlocks count).
  ui_openfiles will force a rollup when the limit is reached, to be
  sure that the limit was actually reached.  ui_posixlocks stays fairly
  loose.  Each cpu generally rolls up only when the pcpu count exceeds
  +32 or goes below -32.

* Give the proc structure a pcpu array for the same counts, in order to
  properly support seteuid() and such.

* Replace P_ADVLOCK with a char field proc->p_advlock_flag, and remove
  token operations around the field.
---
 sys/kern/kern_descrip.c  | 149 ++++++++++++++++++++---
 sys/kern/kern_event.c    | 198 ++++++++++++++++++++++++++++++---
 sys/kern/kern_exit.c     |   3 +-
 sys/kern/kern_fork.c     |   2 +
 sys/kern/kern_lockf.c    |  44 +++++++++----
 sys/kern/kern_resource.c |   6 +-
 sys/kern/sys_generic.c   |  17 +++-
 sys/sys/event.h          |   3 +-
 sys/sys/eventvar.h       |   2 +-
 sys/sys/file.h           |   3 +-
 sys/sys/filedesc.h       |   2 +
 sys/sys/proc.h           |   8 +-
 sys/sys/resourcevar.h    |  19 +++-
 13 files changed, 334 insertions(+), 122 deletions(-)

diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 26a1ef7135..c69696ffe7 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -300,6 +300,26 @@ holdfp_fdp(struct filedesc *fdp, int fd, int flag)
 	return fp;
 }
 
+struct file *
+holdfp_fdp_locked(struct filedesc *fdp, int fd, int flag)
+{
+	struct file *fp;
+
+	if (((u_int)fd) < fdp->fd_nfiles) {
+		fp = fdp->fd_files[fd].fp;	/* can be NULL */
+		if (fp) {
+			if ((fp->f_flag & flag) == 0 && flag != -1) {
+				fp = NULL;
+			} else {
+				fhold(fp);
+			}
+		}
+	} else {
+		fp = NULL;
+	}
+	return fp;
+}
+
 /*
  * Acquire the fp for the specified file descriptor, using the thread
  * cache if possible and caching it if possible.
@@ -656,6 +676,7 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred) u_int newmin; u_int oflags; u_int nflags; + int closedcounter; int tmp, error, flg = F_POSIX; KKASSERT(p); @@ -702,6 +723,7 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred) /* * Operations on file pointers */ + closedcounter = p->p_fd->fd_closedcounter; if ((fp = holdfp(td, fd, -1)) == NULL) return (EBADF); @@ -771,30 +793,24 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred) error = EBADF; break; } - if ((p->p_leader->p_flags & P_ADVLOCK) == 0) { - lwkt_gettoken(&p->p_leader->p_token); - p->p_leader->p_flags |= P_ADVLOCK; - lwkt_reltoken(&p->p_leader->p_token); - } + if (p->p_leader->p_advlock_flag == 0) + p->p_leader->p_advlock_flag = 1; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, - &dat->fc_flock, flg); + &dat->fc_flock, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } - if ((p->p_leader->p_flags & P_ADVLOCK) == 0) { - lwkt_gettoken(&p->p_leader->p_token); - p->p_leader->p_flags |= P_ADVLOCK; - lwkt_reltoken(&p->p_leader->p_token); - } + if (p->p_leader->p_advlock_flag == 0) + p->p_leader->p_advlock_flag = 1; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, - &dat->fc_flock, flg); + &dat->fc_flock, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, - &dat->fc_flock, F_POSIX); + &dat->fc_flock, F_POSIX); break; default: error = EINVAL; @@ -806,13 +822,13 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred) * we were blocked getting the lock. If this occurs the * close might not have caught the lock. */ - if (checkfdclosed(p->p_fd, fd, fp)) { + if (checkfdclosed(td, p->p_fd, fd, fp, closedcounter)) { dat->fc_flock.l_whence = SEEK_SET; dat->fc_flock.l_start = 0; dat->fc_flock.l_len = 0; dat->fc_flock.l_type = F_UNLCK; - (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, - F_UNLCK, &dat->fc_flock, F_POSIX); + VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_UNLCK, &dat->fc_flock, F_POSIX); } break; @@ -834,7 +850,7 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred) if (dat->fc_flock.l_whence == SEEK_CUR) dat->fc_flock.l_start += fp->f_offset; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, - &dat->fc_flock, F_POSIX); + &dat->fc_flock, F_POSIX); break; default: error = EINVAL; @@ -1041,7 +1057,9 @@ retry: * old descriptor. delfp inherits the ref from the * descriptor table. */ + ++fdp->fd_closedcounter; fclearcache(&fdp->fd_files[new], NULL, 0); + ++fdp->fd_closedcounter; delfp = fdp->fd_files[new].fp; fdp->fd_files[new].fp = NULL; fdp->fd_files[new].reserved = 1; @@ -1622,17 +1640,32 @@ fdalloc(struct proc *p, int want, int *result) * Check that the user has not run out of descriptors (non-root only). * As a safety measure the dtable is allowed to have at least * minfilesperproc open fds regardless of the maxfilesperuser limit. + * + * This isn't as loose a spec as ui_posixlocks, so we use atomic + * ops to force synchronize and recheck if we would otherwise + * error. 
*/ if (p->p_ucred->cr_uid && fdp->fd_nfiles >= minfilesperproc) { uip = p->p_ucred->cr_uidinfo; if (uip->ui_openfiles > maxfilesperuser) { - krateprintf(&krate_uidinfo, - "Warning: user %d pid %d (%s) ran out of " - "file descriptors (%d/%d)\n", - p->p_ucred->cr_uid, (int)p->p_pid, - p->p_comm, - uip->ui_openfiles, maxfilesperuser); - return(ENFILE); + int n; + int count; + + for (n = 0; n < ncpus; ++n) { + count = atomic_swap_int( + &uip->ui_pcpu[n].pu_openfiles, 0); + atomic_add_int(&uip->ui_openfiles, count); + } + if (uip->ui_openfiles > maxfilesperuser) { + krateprintf(&krate_uidinfo, + "Warning: user %d pid %d (%s) " + "ran out of file descriptors " + "(%d/%d)\n", + p->p_ucred->cr_uid, (int)p->p_pid, + p->p_comm, + uip->ui_openfiles, maxfilesperuser); + return(ENFILE); + } } } @@ -1900,7 +1933,9 @@ fdrevoke_proc_callback(struct proc *p, void *vinfo) if ((fp = fdp->fd_files[n].fp) == NULL) continue; if (fp->f_flag & FREVOKED) { + ++fdp->fd_closedcounter; fclearcache(&fdp->fd_files[n], NULL, 0); + ++fdp->fd_closedcounter; fhold(info->nfp); fdp->fd_files[n].fp = info->nfp; spin_unlock(&fdp->fd_spin); @@ -1992,10 +2027,24 @@ done: * and a close is not currently in progress. */ int -checkfdclosed(struct filedesc *fdp, int fd, struct file *fp) +checkfdclosed(thread_t td, struct filedesc *fdp, int fd, struct file *fp, + int closedcounter) { + struct fdcache *fdc; int error; + cpu_lfence(); + if (fdp->fd_closedcounter == closedcounter) + return 0; + + if (td->td_proc && td->td_proc->p_fd == fdp) { + for (fdc = &td->td_fdcache[0]; + fdc < &td->td_fdcache[NFDCACHE]; ++fdc) { + if (fdc->fd == fd && fdc->fp == fp) + return 0; + } + } + spin_lock_shared(&fdp->fd_spin); if ((unsigned)fd >= fdp->fd_nfiles || fp != fdp->fd_files[fd].fp) error = EBADF; @@ -2054,9 +2103,11 @@ funsetfd_locked(struct filedesc *fdp, int fd) return (NULL); if ((fp = fdp->fd_files[fd].fp) == NULL) return (NULL); + ++fdp->fd_closedcounter; fclearcache(&fdp->fd_files[fd], NULL, 0); fdp->fd_files[fd].fp = NULL; fdp->fd_files[fd].fileflags = 0; + ++fdp->fd_closedcounter; fdreserve_locked(fdp, fd, -1); fdfixup_locked(fdp, fd); @@ -2135,16 +2186,31 @@ fsetcred(struct file *fp, struct ucred *ncr) { struct ucred *ocr; struct uidinfo *uip; + struct uidcount *pup; + int cpu = mycpuid; + int count; ocr = fp->f_cred; if (ocr == NULL || ncr == NULL || ocr->cr_uidinfo != ncr->cr_uidinfo) { if (ocr) { uip = ocr->cr_uidinfo; - atomic_add_int(&uip->ui_openfiles, -1); + pup = &uip->ui_pcpu[cpu]; + atomic_add_int(&pup->pu_openfiles, -1); + if (pup->pu_openfiles < -PUP_LIMIT || + pup->pu_openfiles > PUP_LIMIT) { + count = atomic_swap_int(&pup->pu_openfiles, 0); + atomic_add_int(&uip->ui_openfiles, count); + } } if (ncr) { uip = ncr->cr_uidinfo; - atomic_add_int(&uip->ui_openfiles, 1); + pup = &uip->ui_pcpu[cpu]; + atomic_add_int(&pup->pu_openfiles, 1); + if (pup->pu_openfiles < -PUP_LIMIT || + pup->pu_openfiles > PUP_LIMIT) { + count = atomic_swap_int(&pup->pu_openfiles, 0); + atomic_add_int(&uip->ui_openfiles, count); + } } } if (ncr) @@ -2414,8 +2480,7 @@ fdfree(struct proc *p, struct filedesc *repl) KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); - if (fdtol->fdl_refcount == 1 && - (p->p_leader->p_flags & P_ADVLOCK) != 0) { + if (fdtol->fdl_refcount == 1 && p->p_leader->p_advlock_flag) { for (i = 0; i <= fdp->fd_lastfile; ++i) { fdnode = &fdp->fd_files[i]; if (fdnode->fp == NULL || @@ -2431,11 +2496,8 @@ fdfree(struct proc *p, struct filedesc *repl) lf.l_len = 0; lf.l_type = F_UNLCK; 
vp = (struct vnode *)fp->f_data; - (void) VOP_ADVLOCK(vp, - (caddr_t)p->p_leader, - F_UNLCK, - &lf, - F_POSIX); + VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_UNLCK, &lf, F_POSIX); fdrop(fp); spin_lock(&fdp->fd_spin); } @@ -2443,7 +2505,7 @@ fdfree(struct proc *p, struct filedesc *repl) retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && - (p->p_leader->p_flags & P_ADVLOCK) != 0) { + p->p_leader->p_advlock_flag) { /* * close() or do_dup() has cleared a reference * in a shared file descriptor table. @@ -2808,18 +2870,19 @@ closef(struct file *fp, struct proc *p) if (p != NULL && fp->f_type == DTYPE_VNODE && (((struct vnode *)fp->f_data)->v_flag & VMAYHAVELOCKS) ) { - if ((p->p_leader->p_flags & P_ADVLOCK) != 0) { + if (p->p_leader->p_advlock_flag) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = (struct vnode *)fp->f_data; - (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, - &lf, F_POSIX); + VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, + &lf, F_POSIX); } fdtol = p->p_fdtol; if (fdtol != NULL) { lwkt_gettoken(&p->p_token); + /* * Handle special case where file descriptor table * is shared between multiple process leaders. @@ -2827,8 +2890,7 @@ closef(struct file *fp, struct proc *p) for (fdtol = fdtol->fdl_next; fdtol != p->p_fdtol; fdtol = fdtol->fdl_next) { - if ((fdtol->fdl_leader->p_flags & - P_ADVLOCK) == 0) + if (fdtol->fdl_leader->p_advlock_flag == 0) continue; fdtol->fdl_holdcount++; lf.l_whence = SEEK_SET; @@ -2836,9 +2898,8 @@ closef(struct file *fp, struct proc *p) lf.l_len = 0; lf.l_type = F_UNLCK; vp = (struct vnode *)fp->f_data; - (void) VOP_ADVLOCK(vp, - (caddr_t)fdtol->fdl_leader, - F_UNLCK, &lf, F_POSIX); + VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, + F_UNLCK, &lf, F_POSIX); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { @@ -2933,7 +2994,7 @@ fdrop(struct file *fp) lf.l_len = 0; lf.l_type = F_UNLCK; vp = (struct vnode *)fp->f_data; - (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0); + VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0); } if (fp->f_ops != &badfileops) error = fo_close(fp); diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index a8752b9888..1f0278632e 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -49,10 +49,12 @@ #include #include #include +#include #include #include #include +#include #define EVENT_REGISTER 1 #define EVENT_PROCESS 2 @@ -72,7 +74,7 @@ struct knote_cache_list { } __cachealign; static int kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count, - struct knote *marker); + struct knote *marker, int closedcounter); static int kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags); static int kqueue_write(struct file *fp, struct uio *uio, @@ -395,6 +397,7 @@ filt_proc(struct knote *kn, long hint) if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) { struct kevent kev; int error; + int n; /* * register knote with new process. 
@@ -405,7 +408,8 @@ filt_proc(struct knote *kn, long hint) kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata; /* preserve udata */ - error = kqueue_register(kn->kn_kq, &kev); + n = 1; + error = kqueue_register(kn->kn_kq, &kev, &n); if (error) kn->kn_fflags |= NOTE_TRACKERR; } @@ -797,8 +801,10 @@ kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap, struct kevent *kevp; struct timespec *tsp, ats; int i, n, total, error, nerrors = 0; + int gobbled; int lres; int limit = kq_checkloop; + int closedcounter; struct kevent kev[KQ_NEVENTS]; struct knote marker; struct lwkt_token *tok; @@ -809,6 +815,8 @@ kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap, tsp = tsp_in; *res = 0; + closedcounter = kq->kq_fdp->fd_closedcounter; + for (;;) { n = 0; error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n); @@ -816,10 +824,13 @@ kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap, return error; if (n == 0) break; - for (i = 0; i < n; i++) { + for (i = 0; i < n; ++i) + kev[i].flags &= ~EV_SYSFLAGS; + for (i = 0; i < n; ++i) { + gobbled = n - i; + error = kqueue_register(kq, &kev[i], &gobbled); + i += gobbled - 1; kevp = &kev[i]; - kevp->flags &= ~EV_SYSFLAGS; - error = kqueue_register(kq, kevp); /* * If a registration returns an error we @@ -970,7 +981,7 @@ kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap, * Process all received events * Account for all non-spurious events in our total */ - i = kqueue_scan(kq, kev, n, &marker); + i = kqueue_scan(kq, kev, n, &marker, closedcounter); if (i) { lres = *res; error = kevent_copyoutfn(uap, kev, i, res); @@ -1063,21 +1074,105 @@ sys_kevent(struct kevent_args *uap) return (error); } +/* + * Efficiently load multiple file pointers. This significantly reduces + * threaded overhead. When doing simple polling we can depend on the + * per-thread (fd,fp) cache. With more descriptors, we batch. + */ +static +void +floadkevfps(thread_t td, struct filedesc *fdp, struct kevent *kev, + struct file **fp, int climit) +{ + struct filterops *fops; + int tdcache; + + if (climit <= 2 && td->td_proc && td->td_proc->p_fd == fdp) { + tdcache = 1; + } else { + tdcache = 0; + spin_lock_shared(&fdp->fd_spin); + } + + while (climit) { + *fp = NULL; + if (kev->filter < 0 && + kev->filter + EVFILT_SYSCOUNT >= 0) { + fops = sysfilt_ops[~kev->filter]; + if (fops->f_flags & FILTEROP_ISFD) { + if (tdcache) { + *fp = holdfp(td, kev->ident, -1); + } else { + *fp = holdfp_fdp_locked(fdp, + kev->ident, -1); + } + } + } + --climit; + ++fp; + ++kev; + } + if (tdcache == 0) + spin_unlock_shared(&fdp->fd_spin); +} + +/* + * Register up to *countp kev's. Always registers at least 1. + * + * The number registered is returned in *countp. + * + * If an error occurs or a kev is flagged EV_RECEIPT, it is + * processed and included in *countp, and processing then + * stops. 
+ */ int -kqueue_register(struct kqueue *kq, struct kevent *kev) +kqueue_register(struct kqueue *kq, struct kevent *kev, int *countp) { struct filedesc *fdp = kq->kq_fdp; struct klist *list = NULL; struct filterops *fops; - struct file *fp = NULL; + struct file *fp[KQ_NEVENTS]; struct knote *kn = NULL; struct thread *td; - int error = 0; + int error; + int count; + int climit; + int closedcounter; struct knote_cache_list *cache_list; + td = curthread; + climit = *countp; + if (climit > KQ_NEVENTS) + climit = KQ_NEVENTS; + closedcounter = fdp->fd_closedcounter; + floadkevfps(td, fdp, kev, fp, climit); + + lwkt_getpooltoken(kq); + count = 0; + + /* + * To avoid races, only one thread can register events on this + * kqueue at a time. + */ + while (__predict_false(kq->kq_regtd != NULL && kq->kq_regtd != td)) { + kq->kq_state |= KQ_REGWAIT; + tsleep(&kq->kq_regtd, 0, "kqreg", 0); + } + if (__predict_false(kq->kq_regtd != NULL)) { + /* Recursive calling of kqueue_register() */ + td = NULL; + } else { + /* Owner of the kq_regtd, i.e. td != NULL */ + kq->kq_regtd = td; + } + +loop: if (kev->filter < 0) { - if (kev->filter + EVFILT_SYSCOUNT < 0) - return (EINVAL); + if (kev->filter + EVFILT_SYSCOUNT < 0) { + error = EINVAL; + ++count; + goto done; + } fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ } else { /* @@ -1085,14 +1180,18 @@ kqueue_register(struct kqueue *kq, struct kevent *kev) * filter attach routine is responsible for insuring that * the identifier can be attached to it. */ - return (EINVAL); + error = EINVAL; + ++count; + goto done; } if (fops->f_flags & FILTEROP_ISFD) { /* validate descriptor */ - fp = holdfp_fdp(fdp, kev->ident, -1); - if (fp == NULL) - return (EBADF); + if (fp[count] == NULL) { + error = EBADF; + ++count; + goto done; + } } cache_list = &knote_cache_lists[mycpuid]; @@ -1106,32 +1205,11 @@ kqueue_register(struct kqueue *kq, struct kevent *kev) crit_exit(); } - td = curthread; - lwkt_getpooltoken(kq); - - /* - * Make sure that only one thread can register event on this kqueue, - * so that we would not suffer any race, even if the registration - * blocked, i.e. kq token was released, and the kqueue was shared - * between threads (this should be rare though). - */ - while (__predict_false(kq->kq_regtd != NULL && kq->kq_regtd != td)) { - kq->kq_state |= KQ_REGWAIT; - tsleep(&kq->kq_regtd, 0, "kqreg", 0); - } - if (__predict_false(kq->kq_regtd != NULL)) { - /* Recursive calling of kqueue_register() */ - td = NULL; - } else { - /* Owner of the kq_regtd, i.e. td != NULL */ - kq->kq_regtd = td; - } - - if (fp != NULL) { - list = &fp->f_klist; + if (fp[count] != NULL) { + list = &fp[count]->f_klist; } else if (kq->kq_knhashmask) { list = &kq->kq_knhash[ - KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; + KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; } if (list != NULL) { lwkt_getpooltoken(list); @@ -1154,6 +1232,7 @@ again: */ if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { error = ENOENT; + ++count; goto done; } @@ -1173,7 +1252,7 @@ again: cache_list->knote_cache_cnt--; crit_exit(); } - kn->kn_fp = fp; + kn->kn_fp = fp[count]; kn->kn_kq = kq; kn->kn_fop = fops; @@ -1181,7 +1260,7 @@ again: * apply reference count to knote structure, and * do not release it at the end of this routine. 
*/ - fp = NULL; + fp[count] = NULL; /* safety */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; @@ -1199,6 +1278,7 @@ again: if ((error = filter_attach(kn)) != 0) { kn->kn_status |= KN_DELETING | KN_REPROCESS; knote_drop(kn); + ++count; goto done; } @@ -1209,7 +1289,8 @@ again: * want to end up with a knote on a closed descriptor. */ if ((fops->f_flags & FILTEROP_ISFD) && - checkfdclosed(fdp, kev->ident, kn->kn_fp)) { + checkfdclosed(curthread, fdp, kev->ident, kn->kn_fp, + closedcounter)) { kn->kn_status |= KN_DELETING | KN_REPROCESS; } } else { @@ -1244,6 +1325,8 @@ again: * Delete the existing knote */ knote_detach_and_drop(kn); + error = 0; + ++count; goto done; } else { /* @@ -1300,6 +1383,22 @@ again: knote_release(kn); /* kn may be invalid now */ + /* + * Loop control. We stop on errors (above), and also stop after + * processing EV_RECEIPT, so the caller can process it. + */ + ++count; + if (kev->flags & EV_RECEIPT) { + error = 0; + goto done; + } + ++kev; + if (count < climit) + goto loop; + + /* + * Cleanup + */ done: if (td != NULL) { /* Owner of the kq_regtd */ kq->kq_regtd = NULL; @@ -1309,8 +1408,13 @@ done: } } lwkt_relpooltoken(kq); - if (fp != NULL) - fdrop(fp); + + *countp = count; + while (count < climit) { + if (fp[count]) + fdrop(fp[count]); + ++count; + } return (error); } @@ -1323,9 +1427,10 @@ done: */ static int kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count, - struct knote *marker) + struct knote *marker, int closedcounter) { struct knote *kn, local_marker; + thread_t td = curthread; int total; total = 0; @@ -1382,7 +1487,8 @@ kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count, * to match up the event against a knote and will go haywire. */ if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && - checkfdclosed(kq->kq_fdp, kn->kn_kevent.ident, kn->kn_fp)) { + checkfdclosed(td, kq->kq_fdp, kn->kn_kevent.ident, + kn->kn_fp, closedcounter)) { kn->kn_status |= KN_DELETING | KN_REPROCESS; } diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index c9928242f0..fc2a314d64 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -390,8 +390,6 @@ exit1(int rv) */ semexit(p); - KKASSERT(p->p_numposixlocks == 0); - /* The next two chunks should probably be moved to vmspace_exit. */ vm = p->p_vmspace; @@ -1140,6 +1138,7 @@ loop: */ PHOLD(p); PRELEZOMB(p); + kfree(p->p_uidpcpu, M_SUBPROC); kfree(p, M_PROC); atomic_add_int(&nprocs, -1); error = 0; diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index ad66d2e424..6acb67be2e 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -466,6 +466,8 @@ fork1(struct lwp *lp1, int flags, struct proc **procp) spin_init(&p2->p_spin, "procfork1"); lwkt_token_init(&p2->p_token, "proc"); lwkt_gettoken(&p2->p_token); + p2->p_uidpcpu = kmalloc(sizeof(*p2->p_uidpcpu) * ncpus, + M_SUBPROC, M_WAITOK | M_ZERO); /* * Setup linkage for kernel based threading XXX lwp. Also add the diff --git a/sys/kern/kern_lockf.c b/sys/kern/kern_lockf.c index d7367dd36c..3bb8f4de9f 100644 --- a/sys/kern/kern_lockf.c +++ b/sys/kern/kern_lockf.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2004 Joerg Sonnenberger . All rights reserved. - * Copyright (c) 2006 Matthew Dillon . All rights reserved. + * Copyright (c) 2006-2018 Matthew Dillon . All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. 
@@ -127,18 +127,27 @@ void lf_count_adjust(struct proc *p, int increase) { struct uidinfo *uip; + struct uidcount *pup; + int n; KKASSERT(p != NULL); uip = p->p_ucred->cr_uidinfo; - if (increase) - atomic_add_int(&uip->ui_posixlocks, p->p_numposixlocks); - else - atomic_add_int(&uip->ui_posixlocks, -p->p_numposixlocks); + pup = &uip->ui_pcpu[mycpuid]; + + if (increase) { + for (n = 0; n < ncpus; ++n) + pup->pu_posixlocks += p->p_uidpcpu[n].pu_posixlocks; + } else { + for (n = 0; n < ncpus; ++n) + pup->pu_posixlocks -= p->p_uidpcpu[n].pu_posixlocks; + } - KASSERT(uip->ui_posixlocks >= 0, - ("Negative number of POSIX locks held by %s user: %d.", - increase ? "new" : "old", uip->ui_posixlocks)); + if (pup->pu_posixlocks < -PUP_LIMIT || + pup->pu_posixlocks > PUP_LIMIT) { + atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks); + pup->pu_posixlocks = 0; + } } static int @@ -160,14 +169,17 @@ lf_count_change(struct proc *owner, int diff) uip->ui_posixlocks >= max ) { ret = 1; } else { - atomic_add_int(&uip->ui_posixlocks, diff); - atomic_add_int(&owner->p_numposixlocks, diff); - KASSERT(uip->ui_posixlocks >= 0, - ("Negative number of POSIX locks held by user: %d.", - uip->ui_posixlocks)); - KASSERT(owner->p_numposixlocks >= 0, - ("Negative number of POSIX locks held by proc: %d.", - uip->ui_posixlocks)); + struct uidcount *pup; + int cpu = mycpuid; + + pup = &uip->ui_pcpu[cpu]; + pup->pu_posixlocks += diff; + if (pup->pu_posixlocks < -PUP_LIMIT || + pup->pu_posixlocks > PUP_LIMIT) { + atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks); + pup->pu_posixlocks = 0; + } + owner->p_uidpcpu[cpu].pu_posixlocks += diff; ret = 0; } return ret; diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c index ee678184cd..2262caf751 100644 --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -948,7 +948,7 @@ uicreate(uid_t uid) /* * Allocate space and check for a race */ - uip = kmalloc(sizeof(*uip), M_UIDINFO, M_WAITOK|M_ZERO); + uip = kmalloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO); /* * Initialize structure and enter it into the hash table @@ -957,6 +957,8 @@ uicreate(uid_t uid) uip->ui_uid = uid; uip->ui_ref = 1; /* we're returning a ref */ varsymset_init(&uip->ui_varsymset, NULL); + uip->ui_pcpu = kmalloc(sizeof(*uip->ui_pcpu) * ncpus, + M_UIDINFO, M_WAITOK | M_ZERO); /* * Somebody may have already created the uidinfo for this @@ -970,6 +972,7 @@ uicreate(uid_t uid) spin_uninit(&uip->ui_lock); varsymset_clean(&uip->ui_varsymset); + kfree(uip->ui_pcpu, M_UIDINFO); kfree(uip, M_UIDINFO); uip = tmp; } else { @@ -1048,6 +1051,7 @@ uifree(uid_t uid) varsymset_clean(&uip->ui_varsymset); lockuninit(&uip->ui_varsymset.vx_lock); spin_uninit(&uip->ui_lock); + kfree(uip->ui_pcpu, M_UIDINFO); kfree(uip, M_UIDINFO); } else { spin_unlock(&uihash_lock); diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index b2f6258d5e..ca5a55606b 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -984,7 +984,8 @@ select_copyout(void *arg, struct kevent *kevp, int count, int *res) { struct select_kevent_copyin_args *skap; struct kevent kev; - int i = 0; + int i; + int n; skap = (struct select_kevent_copyin_args *)arg; @@ -996,7 +997,8 @@ select_copyout(void *arg, struct kevent *kevp, int count, int *res) skap->lwp->lwp_kqueue_serial) { kev = kevp[i]; kev.flags = EV_DISABLE|EV_DELETE; - kqueue_register(&skap->lwp->lwp_kqueue, &kev); + n = 1; + kqueue_register(&skap->lwp->lwp_kqueue, &kev, &n); if (nseldebug) { kprintf("select fd %ju mismatched serial %ju\n", 
(uintmax_t)kevp[i].ident, @@ -1376,6 +1378,7 @@ poll_copyout(void *arg, struct kevent *kevp, int count, int *res) struct kevent kev; int count_res; int i; + int n; uint64_t pi; pkap = (struct poll_kevent_copyin_args *)arg; @@ -1392,7 +1395,8 @@ poll_copyout(void *arg, struct kevent *kevp, int count, int *res) if (pi >= pkap->nfds) { kev = kevp[i]; kev.flags = EV_DISABLE|EV_DELETE; - kqueue_register(&pkap->lwp->lwp_kqueue, &kev); + n = 1; + kqueue_register(&pkap->lwp->lwp_kqueue, &kev, &n); if (nseldebug) { kprintf("poll index %ju out of range against " "serial %ju\n", (uintmax_t)pi, @@ -1602,6 +1606,7 @@ socket_wait(struct socket *so, struct timespec *ts, int *res) struct kqueue kq; struct kevent kev; int error, fd; + int n; if ((error = falloc(td->td_lwp, &fp, &fd)) != 0) return (error); @@ -1616,7 +1621,8 @@ socket_wait(struct socket *so, struct timespec *ts, int *res) bzero(&kq, sizeof(kq)); kqueue_init(&kq, td->td_lwp->lwp_proc->p_fd); EV_SET(&kev, fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, NULL); - if ((error = kqueue_register(&kq, &kev)) != 0) { + n = 1; + if ((error = kqueue_register(&kq, &kev, &n)) != 0) { fdrop(fp); return (error); } @@ -1625,7 +1631,8 @@ socket_wait(struct socket *so, struct timespec *ts, int *res) socket_wait_copyout, ts, 0); EV_SET(&kev, fd, EVFILT_READ, EV_DELETE|EV_DISABLE, 0, 0, NULL); - kqueue_register(&kq, &kev); + n = 1; + kqueue_register(&kq, &kev, &n); fp->f_ops = &badfileops; fdrop(fp); diff --git a/sys/sys/event.h b/sys/sys/event.h index 501eb6f3c4..71aa7e9ba1 100644 --- a/sys/sys/event.h +++ b/sys/sys/event.h @@ -253,7 +253,8 @@ extern void knote_assume_knotes(struct kqinfo *, struct kqinfo *, extern void knote_fdclose(struct file *fp, struct filedesc *fdp, int fd); extern void kqueue_init(struct kqueue *kq, struct filedesc *fdp); extern void kqueue_terminate(struct kqueue *kq); -extern int kqueue_register(struct kqueue *kq, struct kevent *kev); +extern int kqueue_register(struct kqueue *kq, struct kevent *kev, + int *countp); extern struct klist fs_klist; /* EVFILT_FS */ diff --git a/sys/sys/eventvar.h b/sys/sys/eventvar.h index c74f31c344..3c4d996c60 100644 --- a/sys/sys/eventvar.h +++ b/sys/sys/eventvar.h @@ -50,7 +50,7 @@ #endif -#define KQ_NEVENTS 8 /* minimize copy{in,out} calls */ +#define KQ_NEVENTS 32 /* limit stack use */ #define KQEXTENT 256 /* linear growth by this amount */ TAILQ_HEAD(kqlist, knote); diff --git a/sys/sys/file.h b/sys/sys/file.h index 8df95a5d99..f3b396492b 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -141,7 +141,8 @@ MALLOC_DECLARE(M_FILE); extern void fhold(struct file *fp); extern int fdrop (struct file *fp); -extern int checkfdclosed(struct filedesc *fdp, int fd, struct file *fp); +extern int checkfdclosed(thread_t td, struct filedesc *fdp, int fd, + struct file *fp, int closedcounter); extern int fp_open(const char *path, int flags, int mode, struct file **fpp); extern int fp_vpopen(struct vnode *vp, int flags, struct file **fpp); extern int fp_pread(struct file *fp, void *buf, size_t nbytes, off_t offset, ssize_t *res, enum uio_seg); diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h index 65d044ad49..c2a685bc82 100644 --- a/sys/sys/filedesc.h +++ b/sys/sys/filedesc.h @@ -93,6 +93,7 @@ struct filedesc { int fd_softrefs; /* softrefs to prevent destruction */ int fd_holdleaderscount; /* block fdfree() for shared close() */ int fd_holdleaderswakeup; /* fdfree() needs wakeup */ + int fd_closedcounter; /* detect close() */ struct spinlock fd_spin; struct fdnode fd_builtin_files[NDFILE]; }; @@ -171,6 +172,7 @@ void 
fdcloseexec (struct proc *p);
 int	fdcheckstd (struct lwp *lp);
 struct file *holdfp (struct thread *td, int fd, int flag);
 struct file *holdfp_fdp (struct filedesc *fdp, int fd, int flag);
+struct file *holdfp_fdp_locked (struct filedesc *fdp, int fd, int flag);
 int	holdsock (struct thread *td, int fdes, struct file **fpp);
 int	holdvnode (struct thread *td, int fd, struct file **fpp);
 void	dropfp(struct thread *td, int fd, struct file *fp);
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 810a09adc1..695cf9d38e 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -73,6 +73,7 @@ struct proc;
 struct pgrp;
 struct session;
 struct lwp;
+struct uidcount;
 
 LIST_HEAD(proclist, proc);
 LIST_HEAD(pgrplist, pgrp);
@@ -250,7 +251,8 @@ struct proc {
 
 	int		p_flags;	/* P_* flags. */
 	enum procstat	p_stat;		/* S* process status. */
-	char		p_pad1[3];
+	char		p_advlock_flag;	/* replaces P_ADVLOCK */
+	char		p_pad1[2];
 
 	pid_t		p_pid;		/* Process identifier. */
 	pid_t		p_ppid;		/* Current parent pid */
@@ -331,7 +333,7 @@ struct proc {
 	void		*p_emuldata;	/* process-specific emulator state */
 	struct usched	*p_usched;	/* Userland scheduling control */
 	struct vkernel_proc *p_vkernel;	/* VKernel support, proc part */
-	int		p_numposixlocks; /* number of POSIX locks */
+	struct uidcount	*p_uidpcpu;
 	void		(*p_userret)(void);/* p: return-to-user hook */
 
 	struct spinlock p_spin;		/* Spinlock for LWP access to proc */
@@ -353,7 +355,7 @@
 #define	p_pgid		p_pgrp->pg_id
 
 /* These flags are kept in p_flags. */
-#define	P_ADVLOCK	0x00001	/* Process may hold a POSIX advisory lock */
+#define	P_UNUSED01	0x00001
 #define	P_CONTROLT	0x00002	/* Has a controlling terminal */
 #define	P_SWAPPEDOUT	0x00004	/* Swapped out of memory */
 #define	P_SYSVSEM	0x00008	/* Might have SysV semaphores */
diff --git a/sys/sys/resourcevar.h b/sys/sys/resourcevar.h
index d83293698c..a3957b6a17 100644
--- a/sys/sys/resourcevar.h
+++ b/sys/sys/resourcevar.h
@@ -102,6 +102,20 @@ struct plimit {
 #define PLIMIT_TESTCPU_XCPU	1
 #define PLIMIT_TESTCPU_KILL	2
 
+/*
+ * Per-cpu tracking structure attached to uidinfo.  These counts are only
+ * synchronized with the uidinfo rollup fields at +/-32.  Resource limits
+ * only check against the ui_posixlocks and ui_openfiles so some slop
+ * is possible (checking against the pcpu structures would cause cache
+ * line ping-ponging)
+ */
+struct uidcount {
+	int	pu_posixlocks;
+	int	pu_openfiles;
+} __cachealign;
+
+#define PUP_LIMIT	32	/* +/-32 rollup */
+
 /*
  * Per uid resource consumption
  */
@@ -115,9 +129,10 @@ struct uidinfo {
 	long	ui_proccnt;	/* number of processes */
 	uid_t	ui_uid;		/* uid */
 	int	ui_ref;		/* reference count */
-	int	ui_posixlocks;	/* number of POSIX locks */
-	int	ui_openfiles;	/* number of open files */
+	int	ui_posixlocks;	/* (rollup) number of POSIX locks */
+	int	ui_openfiles;	/* (rollup) number of open files */
 	struct varsymset ui_varsymset;	/* variant symlinks */
+	struct uidcount	*ui_pcpu;
 };
 
 #endif
-- 
2.41.0
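
The uidinfo and proc changes above all lean on one pattern: each cpu adjusts only its own cache-line-aligned counter slot and folds that slot into the shared rollup field (ui_posixlocks, ui_openfiles) only when the local value drifts past +/-PUP_LIMIT, while a limit check that is about to fail first forces a full rollup so a stale total cannot produce a spurious ENFILE.  Below is a minimal, self-contained userland sketch of that pattern, offered only as an illustration: C11 atomics stand in for the kernel's atomic_add_int()/atomic_swap_int(), and NCPUS plus all identifiers (struct rollup, count_adjust(), count_over_limit()) are made up for the sketch, not the kernel's names.

/*
 * Illustrative model of the +/-32 per-cpu rollup used for
 * ui_openfiles / ui_posixlocks in the patch above.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define NCPUS		8
#define PUP_LIMIT	32		/* fold into the rollup at +/-32 */

struct pcpu_slot {
	_Alignas(64) atomic_int count;	/* one cache line per cpu */
};

struct rollup {
	atomic_int	 total;		/* loose global rollup */
	struct pcpu_slot pcpu[NCPUS];
};

/*
 * Cheap per-cpu adjustment: the shared 'total' is touched only when the
 * local slot drifts past +/-PUP_LIMIT, so hot open()/close()/lockf paths
 * do not ping-pong a global cache line.
 */
static void
count_adjust(struct rollup *r, int cpu, int diff)
{
	int n = atomic_fetch_add(&r->pcpu[cpu].count, diff) + diff;

	if (n < -PUP_LIMIT || n > PUP_LIMIT) {
		n = atomic_exchange(&r->pcpu[cpu].count, 0);
		atomic_fetch_add(&r->total, n);
	}
}

/*
 * Limit test: if the loose total already looks over the limit, force a
 * full rollup of every per-cpu slot and re-check, so a stale total cannot
 * cause a spurious failure.  This mirrors the fdalloc() change above;
 * the posixlocks count skips the forced rollup and stays loose.
 */
static bool
count_over_limit(struct rollup *r, int limit)
{
	if (atomic_load(&r->total) <= limit)
		return false;
	for (int cpu = 0; cpu < NCPUS; ++cpu) {
		atomic_fetch_add(&r->total,
		    atomic_exchange(&r->pcpu[cpu].count, 0));
	}
	return atomic_load(&r->total) > limit;
}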
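
The checkfdclosed() change follows the same "avoid the shared lock" idea: fd_closedcounter is bumped around every close/fclearcache(), callers such as kern_fcntl() (F_SETLK) and the kqueue paths sample it before they might block, and the post-block check can report "still open" without taking fd_spin whenever no close happened at all, or the calling thread still holds the (fd,fp) pair in its per-thread cache.  The sketch below is a simplified rendering of that fast path, not the kernel code: the structures are stripped-down stand-ins, fd_maybe_closed() is an invented name, and the final table compare is shown without the shared fd_spin lock the real checkfdclosed() takes.

/*
 * Simplified model of the checkfdclosed() fast path from the patch above.
 */
#include <stdatomic.h>

#define NFDCACHE	4		/* per-thread (fd,fp) cache size */

struct file;				/* opaque here */

struct fdcache {
	int		fd;
	struct file	*fp;
};

struct filedesc {
	int		closedcounter;	/* bumped around every close */
	int		nfiles;
	struct file	**files;	/* simplified slot array */
};

struct thread {
	struct filedesc	*fdp;
	struct fdcache	fdcache[NFDCACHE];
};

/*
 * Returns non-zero only if (fd,fp) may have been closed since the caller
 * sampled 'closedcounter' (e.g. before blocking in VOP_ADVLOCK()).
 */
static int
fd_maybe_closed(struct thread *td, struct filedesc *fdp, int fd,
		struct file *fp, int closedcounter)
{
	atomic_thread_fence(memory_order_acquire);	/* ~ cpu_lfence() */

	/* 1. No close at all since the snapshot: trivially still open. */
	if (fdp->closedcounter == closedcounter)
		return 0;

	/* 2. Our thread still caches (fd,fp): not closed out from under us. */
	if (td->fdp == fdp) {
		for (int i = 0; i < NFDCACHE; ++i) {
			if (td->fdcache[i].fd == fd &&
			    td->fdcache[i].fp == fp)
				return 0;
		}
	}

	/* 3. Slow path: compare the descriptor table slot itself. */
	if ((unsigned)fd >= (unsigned)fdp->nfiles || fdp->files[fd] != fp)
		return 1;			/* EBADF in the kernel */
	return 0;
}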