From cf9f4e88a67efbe26053ec74dc81d4f8e6c5bafc Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sat, 21 Aug 2010 18:41:15 -0700
Subject: [PATCH] kernel - close numerous kqueue MP and blocking races

* create race

  Close a race where one thread registers a kevent and blocks in
  f_attach, allowing another thread to register a change or deletion
  on the same event.

* delete race

  Close a race where one thread deletes a kevent and blocks in
  f_detach, allowing another thread to register a change or deletion
  on the same event.

* scan/close race

  Close a race where one thread is in the middle of a kqueue_scan()
  and another concurrently close()s a related file descriptor.  This
  could cause select and poll to loop forever, since the 'spurious'
  kevent could never be deleted.

* register/close race

  Close a race where one thread registers a kevent and blocks in
  f_attach while another thread concurrently close()s the related
  file descriptor.
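The common pattern behind all four fixes is a pair of transition flags
on the knote.  The fragment below is illustrative only and is not part
of the diff; the KN_CREATING/KN_DELETING flags and the kn_status field
match the patch, while the surrounding lines are a simplified sketch of
the deleter and the event-poster:

	/* deleter: claim the knote before f_detach can block */
	if (kn->kn_status & (KN_CREATING | KN_DELETING))
		return;			/* someone else owns it, back off */
	kn->kn_status |= KN_DELETING;
	kn->kn_fop->f_detach(kn);	/* may block, dropping our locks */
	knote_drop(kn);

	/* event poster: never run a filter on a half-built knote */
	if (kn->kn_status & (KN_CREATING | KN_DELETING))
		return (0);		/* report no event while in transition */

Because the flags are set before any blocking call and are checked by
every other path that can reach the knote, a knote that is still being
created or is already being torn down is simply skipped instead of
being detached twice or having its filter op called.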
---
 sys/kern/kern_descrip.c |  12 +++--
 sys/kern/kern_event.c   | 112 ++++++++++++++++++++++++++++++----------
 sys/sys/event.h         |   2 +
 sys/sys/file.h          |   1 +
 4 files changed, 95 insertions(+), 32 deletions(-)

diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 5941b6b3de..508d64fe45 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -110,7 +110,6 @@
 static void fsetfd_locked(struct filedesc *fdp, struct file *fp, int fd);
 static void fdreserve_locked (struct filedesc *fdp, int fd0, int incr);
 static struct file *funsetfd_locked (struct filedesc *fdp, int fd);
-static int checkfpclosed(struct filedesc *fdp, int fd, struct file *fp);
 static void ffree(struct file *fp);
 
 static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
@@ -354,7 +353,7 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred)
 		 * we were blocked getting the lock.  If this occurs the
 		 * close might not have caught the lock.
 		 */
-		if (checkfpclosed(p->p_fd, fd, fp)) {
+		if (checkfdclosed(p->p_fd, fd, fp)) {
 			dat->fc_flock.l_whence = SEEK_SET;
 			dat->fc_flock.l_start = 0;
 			dat->fc_flock.l_len = 0;
@@ -1494,16 +1493,19 @@ done:
 }
 
 /*
+ * Check for races against a file descriptor by determining that the
+ * file pointer is still associated with the specified file descriptor,
+ * and a close is not currently in progress.
+ *
  * MPSAFE
  */
-static int
-checkfpclosed(struct filedesc *fdp, int fd, struct file *fp)
+int
+checkfdclosed(struct filedesc *fdp, int fd, struct file *fp)
 {
 	int error;
 
 	spin_lock_rd(&fdp->fd_spin);
-	if ((unsigned) fd >= fdp->fd_nfiles || fp != fdp->fd_files[fd].fp)
+	if ((unsigned)fd >= fdp->fd_nfiles || fp != fdp->fd_files[fd].fp)
 		error = EBADF;
 	else
 		error = 0;
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index bef6ec7060..16b0a938dd 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -84,7 +84,6 @@ static int kqueue_stat(struct file *fp, struct stat *st,
 static int	kqueue_close(struct file *fp);
 static void	kqueue_wakeup(struct kqueue *kq);
 static int	filter_attach(struct knote *kn);
-static void	filter_detach(struct knote *kn);
 static int	filter_event(struct knote *kn, long hint);
 
 /*
@@ -102,6 +101,7 @@ static struct fileops kqueueops = {
 
 static void	knote_attach(struct knote *kn);
 static void	knote_drop(struct knote *kn);
+static void	knote_detach_and_drop(struct knote *kn);
 static void	knote_enqueue(struct knote *kn);
 static void	knote_dequeue(struct knote *kn);
 static void	knote_init(void);
@@ -422,10 +422,8 @@ kqueue_terminate(struct kqueue *kq)
 {
 	struct knote *kn;
 
-	while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL) {
-		filter_detach(kn);
-		knote_drop(kn);
-	}
+	while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL)
+		knote_detach_and_drop(kn);
 
 	if (kq->kq_knhash) {
 		kfree(kq->kq_knhash, M_KQUEUE);
@@ -798,11 +796,29 @@ kqueue_register(struct kqueue *kq, struct kevent *kev)
 			kev->data = 0;
 			kn->kn_kevent = *kev;
 
+			/*
+			 * Interlock against creation/deletion races due
+			 * to f_attach() blocking.  knote_attach() will set
+			 * KN_CREATING.
+			 */
 			knote_attach(kn);
 			if ((error = filter_attach(kn)) != 0) {
+				kn->kn_status |= KN_DELETING;
 				knote_drop(kn);
 				goto done;
 			}
+			kn->kn_status &= ~KN_CREATING;
+
+			/*
+			 * Interlock against close races which remove our
+			 * knotes.  We do not want to end up with a knote
+			 * on a closed descriptor.
+			 */
+			if ((fops->f_flags & FILTEROP_ISFD) &&
+			    (error = checkfdclosed(fdp, kev->ident, kn->kn_fp)) != 0) {
+				knote_detach_and_drop(kn);
+				goto done;
+			}
 		} else {
 			/*
 			 * The user may change some filter values after the
@@ -816,10 +832,8 @@ kqueue_register(struct kqueue *kq, struct kevent *kev)
 
 		if (filter_event(kn, 0))
 			KNOTE_ACTIVATE(kn);
-
 	} else if (kev->flags & EV_DELETE) {
-		filter_detach(kn);
-		knote_drop(kn);
+		knote_detach_and_drop(kn);
 		goto done;
 	}
 
@@ -917,15 +931,44 @@ kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
 		TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
 		kq->kq_count--;
-		if (kn->kn_status & KN_DISABLED) {
-			kn->kn_status &= ~KN_QUEUED;
+		kn->kn_status &= ~KN_QUEUED;
+
+		/*
+		 * Even though close/dup2 will clean out pending knotes,
+		 * this code is MPSAFE and it is possible to race a close
+		 * in between the removal of its descriptor and the
+		 * clearing out of the knote(s).
+		 *
+		 * In this case we must ensure that the knote is not queued
+		 * to knpend or we risk an infinite kernel loop calling
+		 * kqueue_scan, because the select/poll code will not be
+		 * able to delete the event.
+		 */
+		if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
+		    checkfdclosed(kq->kq_fdp, kn->kn_kevent.ident, kn->kn_fp)) {
+			kn->kn_status &= ~KN_ACTIVE;
 			continue;
 		}
+
+		/*
+		 * If disabled we ensure the event is not queued but leave
+		 * its active bit set.  On re-enablement the event may be
+		 * immediately triggered.
+		 */
+		if (kn->kn_status & KN_DISABLED)
+			continue;
+
+		/*
+		 * If not running in one-shot mode and the event is no
+		 * longer present we ensure it is removed from the queue
+		 * and ignore it.
+		 */
 		if ((kn->kn_flags & EV_ONESHOT) == 0 &&
 		    filter_event(kn, 0) == 0) {
-			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+			kn->kn_status &= ~KN_ACTIVE;
 			continue;
 		}
+
 		*kevp++ = kn->kn_kevent;
 		++total;
 		--count;
@@ -934,16 +977,15 @@ kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
 		 * Post-event action on the note
 		 */
 		if (kn->kn_flags & EV_ONESHOT) {
-			kn->kn_status &= ~KN_QUEUED;
-			filter_detach(kn);
-			knote_drop(kn);
+			knote_detach_and_drop(kn);
 		} else if (kn->kn_flags & EV_CLEAR) {
 			kn->kn_data = 0;
 			kn->kn_fflags = 0;
-			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+			kn->kn_status &= ~KN_ACTIVE;
 		} else {
 			TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
 			kq->kq_count++;
+			kn->kn_status |= KN_QUEUED;
 		}
 	}
 	TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
@@ -1070,30 +1112,48 @@ filter_attach(struct knote *kn)
 }
 
 /*
+ * Detach the knote and drop it, destroying the knote.
+ *
  * Calls filterops f_detach function, acquiring mplock if filter is not
  * marked as FILTEROP_MPSAFE.
+ *
+ * This can race due to the MP lock and/or locks acquired by f_detach,
+ * so we interlock with KN_DELETING.  It is also possible to race
+ * a create for the same reason if userland tries to delete the knote
+ * before the create is complete.
  */
 static void
-filter_detach(struct knote *kn)
+knote_detach_and_drop(struct knote *kn)
 {
-	if (!(kn->kn_fop->f_flags & FILTEROP_MPSAFE)) {
-		get_mplock();
+	if (kn->kn_status & (KN_CREATING | KN_DELETING))
+		return;
+	kn->kn_status |= KN_DELETING;
+
+	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
 		kn->kn_fop->f_detach(kn);
-		rel_mplock();
 	} else {
+		get_mplock();
 		kn->kn_fop->f_detach(kn);
+		rel_mplock();
 	}
+	knote_drop(kn);
 }
 
 /*
  * Calls filterops f_event function, acquiring mplock if filter is not
  * marked as FILTEROP_MPSAFE.
+ *
+ * If the knote is in the middle of being created or deleted we cannot
+ * safely call the filter op.
  */
 static int
 filter_event(struct knote *kn, long hint)
 {
 	int ret;
 
+	if (kn->kn_status & (KN_CREATING | KN_DELETING))
+		return(0);
+
 	if (!(kn->kn_fop->f_flags & FILTEROP_MPSAFE)) {
 		get_mplock();
 		ret = kn->kn_fop->f_event(kn, hint);
@@ -1114,9 +1174,10 @@ knote(struct klist *list, long hint)
 	struct knote *kn;
 
 	lwkt_gettoken(&kq_token);
-	SLIST_FOREACH(kn, list, kn_next)
+	SLIST_FOREACH(kn, list, kn_next) {
 		if (filter_event(kn, hint))
 			KNOTE_ACTIVATE(kn);
+	}
 	lwkt_reltoken(&kq_token);
 }
 
@@ -1151,10 +1212,8 @@ knote_empty(struct klist *list)
 	struct knote *kn;
 
 	lwkt_gettoken(&kq_token);
-	while ((kn = SLIST_FIRST(list)) != NULL) {
-		filter_detach(kn);
-		knote_drop(kn);
-	}
+	while ((kn = SLIST_FIRST(list)) != NULL)
+		knote_detach_and_drop(kn);
 	lwkt_reltoken(&kq_token);
 }
 
@@ -1170,8 +1229,7 @@ knote_fdclose(struct file *fp, struct filedesc *fdp, int fd)
 restart:
 	SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
 		if (kn->kn_kq->kq_fdp == fdp && kn->kn_id == fd) {
-			filter_detach(kn);
-			knote_drop(kn);
+			knote_detach_and_drop(kn);
 			goto restart;
 		}
 	}
@@ -1195,7 +1253,7 @@ knote_attach(struct knote *kn)
 	}
 	SLIST_INSERT_HEAD(list, kn, kn_link);
 	TAILQ_INSERT_HEAD(&kq->kq_knlist, kn, kn_kqlink);
-	kn->kn_status = 0;
+	kn->kn_status = KN_CREATING;
 }
 
 static void
diff --git a/sys/sys/event.h b/sys/sys/event.h
index f0d20b7411..9a0d7591b2 100644
--- a/sys/sys/event.h
+++ b/sys/sys/event.h
@@ -194,6 +194,8 @@ struct knote {
 #define KN_QUEUED	0x02		/* event is on queue */
 #define KN_DISABLED	0x04		/* event is disabled */
 #define KN_DETACHED	0x08		/* knote is detached */
+#define KN_CREATING	0x10		/* creation in progress */
+#define KN_DELETING	0x20		/* deletion in progress */
 
 #define kn_id		kn_kevent.ident
 #define kn_filter	kn_kevent.filter
diff --git a/sys/sys/file.h b/sys/sys/file.h
index ceecf6e6c7..cd3c8f6221 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -147,6 +147,7 @@ MALLOC_DECLARE(M_FILE);
 
 extern void fhold(struct file *fp);
 extern int fdrop (struct file *fp);
+extern int checkfdclosed(struct filedesc *fdp, int fd, struct file *fp);
 extern int fp_open(const char *path, int flags, int mode, struct file **fpp);
 extern int fp_vpopen(struct vnode *vp, int flags, struct file **fpp);
 extern int fp_pread(struct file *fp, void *buf, size_t nbytes, off_t offset,
 		    ssize_t *res, enum uio_seg);
-- 
2.41.0
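[Reviewer postscript, not part of the commit: a userland loop of
roughly the following shape can be used to hammer the register/close
and scan/close windows described above.  It uses only the documented
kqueue API; the pipe, the iteration count, and the file name are
arbitrary choices for illustration.]

/*
 * kqclose.c - stress the kevent register/delete vs. close races.
 * One thread adds and deletes an EVFILT_READ kevent on fds[0] while
 * another close()s and re-creates the pipe, recycling the descriptor
 * numbers.  Build with: cc -pthread kqclose.c
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define LOOPS	100000			/* arbitrary */

static int kq;
static int fds[2];

static void *
closer(void *arg)
{
	for (int i = 0; i < LOOPS; ++i) {
		close(fds[0]);
		close(fds[1]);
		if (pipe(fds) < 0)	/* recycle the descriptor numbers */
			err(1, "pipe");
	}
	return (NULL);
}

int
main(void)
{
	struct kevent kev;
	struct timespec ts = { 0, 0 };
	pthread_t td;

	if ((kq = kqueue()) < 0)
		err(1, "kqueue");
	if (pipe(fds) < 0)
		err(1, "pipe");
	if (pthread_create(&td, NULL, closer, NULL) != 0)
		errx(1, "pthread_create");

	for (int i = 0; i < LOOPS; ++i) {
		/* racing registration and deletion on a dying descriptor */
		EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
		(void)kevent(kq, &kev, 1, NULL, 0, &ts);
		EV_SET(&kev, fds[0], EVFILT_READ, EV_DELETE, 0, 0, NULL);
		(void)kevent(kq, &kev, 1, NULL, 0, &ts);
	}
	pthread_join(td, NULL);
	printf("completed %d iterations\n", LOOPS);
	return (0);
}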