1 /*
2  * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by John Birrell.
16  * 4. Neither the name of the author nor the names of any co-contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  * $FreeBSD: src/lib/libc_r/uthread/uthread_kern.c,v 1.28.2.13 2002/10/22 14:44:03 fjoe Exp $
33  * $DragonFly: src/lib/libc_r/uthread/uthread_kern.c,v 1.2 2003/06/17 04:26:48 dillon Exp $
34  *
35  */
36 #include <errno.h>
37 #include <poll.h>
38 #include <stdlib.h>
39 #include <stdarg.h>
40 #include <string.h>
41 #include <unistd.h>
42 #include <setjmp.h>
43 #include <sys/param.h>
44 #include <sys/types.h>
45 #include <sys/signalvar.h>
46 #include <sys/stat.h>
47 #include <sys/time.h>
48 #include <sys/socket.h>
49 #include <sys/uio.h>
50 #include <sys/syscall.h>
51 #include <fcntl.h>
52 #include <pthread.h>
53 #include "pthread_private.h"
54
55 /* #define DEBUG_THREAD_KERN */
56 #ifdef DEBUG_THREAD_KERN
57 #define DBG_MSG         stdout_debug
58 #else
59 #define DBG_MSG(x...)
60 #endif
61
62 /* Static function prototype definitions: */
63 static void
64 thread_kern_poll(int wait_reqd);
65
66 static void
67 dequeue_signals(void);
68
69 static inline void
70 thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in);
71
72 /* Static variables: */
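/*
 * A brief note on these flags (inferred from their use below): last_tick
 * remembers the _sched_ticks value seen on the previous pass through the
 * scheduling loop, so the scheduler can tell whether a new scheduling
 * signal has arrived since file descriptors were last polled.
 * called_from_handler is set when the scheduler is entered from the
 * signal handler, so the process signal mask can be restored afterwards.
 */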
73 static int      last_tick = 0;
74 static int      called_from_handler = 0;
75
76 /*
77  * This is called when a signal handler finishes and wants to
78  * return to a previous frame.
79  */
80 void
81 _thread_kern_sched_frame(struct pthread_signal_frame *psf)
82 {
83         struct pthread  *curthread = _get_curthread();
84
85         /*
86          * Flag the pthread kernel as executing scheduler code
87          * to prevent a signal from interrupting this execution and
88          * corrupting the (soon-to-be) current frame.
89          */
90         _thread_kern_in_sched = 1;
91
92         /* Restore the signal frame: */
93         _thread_sigframe_restore(curthread, psf);
94
95         /* The signal mask was restored; check for any pending signals: */
96         curthread->check_pending = 1;
97
98         /* Switch to the thread scheduler: */
99         ___longjmp(_thread_kern_sched_jb, 1);
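        /* NOTREACHED: the longjmp transfers control to the scheduler. */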
100 }
101
102
103 void
104 _thread_kern_sched(ucontext_t *ucp)
105 {
106         struct pthread  *curthread = _get_curthread();
107
108         /*
109          * Flag the pthread kernel as executing scheduler code
110          * to prevent a scheduler signal from interrupting this
111          * execution and calling the scheduler again.
112          */
113         _thread_kern_in_sched = 1;
114
115         /* Check if this function was called from the signal handler: */
116         if (ucp != NULL) {
117                 /* XXX - Save FP registers? */
118                 FP_SAVE_UC(ucp);
119                 called_from_handler = 1;
120                 DBG_MSG("Entering scheduler due to signal\n");
121         }
122
123         /* Save the state of the current thread: */
124         if (_setjmp(curthread->ctx.jb) != 0) {
125                 DBG_MSG("Returned from ___longjmp, thread %p\n",
126                     curthread);
127                 /*
128                  * This point is reached when a longjmp() is called
129                  * to restore the state of a thread.
130                  *
131                  * This is the normal way out of the scheduler.
132                  */
133                 _thread_kern_in_sched = 0;
134
135                 if (curthread->sig_defer_count == 0) {
136                         if (((curthread->cancelflags &
137                             PTHREAD_AT_CANCEL_POINT) == 0) &&
138                             ((curthread->cancelflags &
139                             PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
140                                 /*
141                                  * Cancellations override signals.
142                                  *
143                                  * Stick a cancellation point at the
144                                  * start of each async-cancellable
145                                  * thread's resumption.
146                                  *
147                                  * We allow threads woken at cancel
148                                  * points to do their own checks.
149                                  */
150                                 pthread_testcancel();
151                 }
152
153                 if (_sched_switch_hook != NULL) {
154                         /* Run the installed switch hook: */
155                         thread_run_switch_hook(_last_user_thread, curthread);
156                 }
157                 if (ucp == NULL)
158                         return;
159                 else {
160                         /* XXX - Restore FP registers? */
161                         FP_RESTORE_UC(ucp);
162
163                         /*
164                          * Set the process signal mask in the context; it
165                          * could have been changed by the handler.
166                          */
167                         ucp->uc_sigmask = _process_sigmask;
168
169                         /* Resume the interrupted thread: */
170                         sigreturn(ucp);
171                 }
172         }
173         /* Switch to the thread scheduler: */
174         ___longjmp(_thread_kern_sched_jb, 1);
175 }
176
177 void
178 _thread_kern_sched_sig(void)
179 {
180         struct pthread  *curthread = _get_curthread();
181
182         curthread->check_pending = 1;
183         _thread_kern_sched(NULL);
184 }
185
186
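/*
 * The scheduler proper.  It is entered by the longjmp in
 * _thread_kern_sched() and loops until the thread list is empty:
 * it requeues the outgoing thread according to its state, polls file
 * descriptors, wakes threads whose timeouts have expired, and resumes
 * the highest priority runnable thread with a longjmp to its saved
 * context.
 */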
187 void
188 _thread_kern_scheduler(void)
189 {
190         struct timespec ts;
191         struct timeval  tv;
192         struct pthread  *curthread = _get_curthread();
193         pthread_t       pthread, pthread_h;
194         unsigned int    current_tick;
195         int             add_to_prioq;
196
197         /* If the currently running thread is a user thread, save it: */
198         if ((curthread->flags & PTHREAD_FLAGS_PRIVATE) == 0)
199                 _last_user_thread = curthread;
200
201         if (called_from_handler != 0) {
202                 called_from_handler = 0;
203
204                 /*
205                  * We were called from a signal handler; restore the process
206                  * signal mask.
207                  */
208                 if (__sys_sigprocmask(SIG_SETMASK,
209                     &_process_sigmask, NULL) != 0)
210                         PANIC("Unable to restore process mask after signal");
211         }
212
213         /*
214          * Enter a scheduling loop that finds the next thread that is
215          * ready to run. This loop completes when there are no more threads
216          * in the global list or when a thread has its state restored by
217          * either a sigreturn (if the state was saved as a sigcontext) or a
218          * longjmp (if the state was saved by a setjmp).
219          */
220         while (!(TAILQ_EMPTY(&_thread_list))) {
221                 /* Get the current time of day: */
222                 GET_CURRENT_TOD(tv);
223                 TIMEVAL_TO_TIMESPEC(&tv, &ts);
224                 current_tick = _sched_ticks;
225
226                 /*
227                  * Protect the scheduling queues from access by the signal
228                  * handler.
229                  */
230                 _queue_signals = 1;
231                 add_to_prioq = 0;
232
233                 if (curthread != &_thread_kern_thread) {
234                         /*
235                          * This thread no longer needs to yield the CPU.
236                          */
237                         curthread->yield_on_sig_undefer = 0;
238         
239                         if (curthread->state != PS_RUNNING) {
240                                 /*
241                                  * Save the current time as the time that the
242                                  * thread became inactive:
243                                  */
244                                 curthread->last_inactive = (long)current_tick;
245                                 if (curthread->last_inactive <
246                                     curthread->last_active) {
247                                         /* Account for a rollover: */
248                                         curthread->last_inactive +=
249                                             UINT_MAX + 1;
250                                 }
251                         }
252
253                         /*
254                          * Place the currently running thread into the
255                          * appropriate queue(s).
256                          */
257                         switch (curthread->state) {
258                         case PS_DEAD:
259                         case PS_STATE_MAX: /* to silence -Wall */
260                         case PS_SUSPENDED:
261                                 /*
262                                  * Dead and suspended threads are not placed
263                                  * in any queue:
264                                  */
265                                 break;
266
267                         case PS_RUNNING:
268                                 /*
269                                  * Runnable threads can't be placed in the
270                                  * priority queue until after waiting threads
271                                  * are polled (to preserve round-robin
272                                  * scheduling).
273                                  */
274                                 add_to_prioq = 1;
275                                 break;
276
277                         /*
278                          * States which do not depend on file descriptor I/O
279                          * operations or timeouts:
280                          */
281                         case PS_DEADLOCK:
282                         case PS_FDLR_WAIT:
283                         case PS_FDLW_WAIT:
284                         case PS_FILE_WAIT:
285                         case PS_JOIN:
286                         case PS_MUTEX_WAIT:
287                         case PS_SIGSUSPEND:
288                         case PS_SIGTHREAD:
289                         case PS_SIGWAIT:
290                         case PS_WAIT_WAIT:
291                                 /* No timeouts for these states: */
292                                 curthread->wakeup_time.tv_sec = -1;
293                                 curthread->wakeup_time.tv_nsec = -1;
294
295                                 /* Restart the time slice: */
296                                 curthread->slice_usec = -1;
297
298                                 /* Insert into the waiting queue: */
299                                 PTHREAD_WAITQ_INSERT(curthread);
300                                 break;
301
302                         /* States which can timeout: */
303                         case PS_COND_WAIT:
304                         case PS_SLEEP_WAIT:
305                                 /* Restart the time slice: */
306                                 curthread->slice_usec = -1;
307
308                                 /* Insert into the waiting queue: */
309                                 PTHREAD_WAITQ_INSERT(curthread);
310                                 break;
311         
312                         /* States that require periodic work: */
313                         case PS_SPINBLOCK:
314                                 /* No timeouts for this state: */
315                                 curthread->wakeup_time.tv_sec = -1;
316                                 curthread->wakeup_time.tv_nsec = -1;
317
318                                 /* Increment spinblock count: */
319                                 _spinblock_count++;
320
321                                 /* FALLTHROUGH */
322                         case PS_FDR_WAIT:
323                         case PS_FDW_WAIT:
324                         case PS_POLL_WAIT:
325                         case PS_SELECT_WAIT:
326                                 /* Restart the time slice: */
327                                 curthread->slice_usec = -1;
328         
329                                 /* Insert into the waiting queue: */
330                                 PTHREAD_WAITQ_INSERT(curthread);
331         
332                                 /* Insert into the work queue: */
333                                 PTHREAD_WORKQ_INSERT(curthread);
334                                 break;
335                         }
336
337                         /*
338                          * Are there pending signals for this thread?
339                          *
340                          * This check has to be performed after the thread
341                          * has been placed in the queue(s) appropriate for
342                          * its state.  The process of adding pending signals
343                          * can change a thread's state, which in turn will
344                          * attempt to add or remove the thread from any
345                          * scheduling queue to which it belongs.
346                          */
347                         if (curthread->check_pending != 0) {
348                                 curthread->check_pending = 0;
349                                 _thread_sig_check_pending(curthread);
350                         }
351                 }
352
353                 /*
354                  * Avoid polling file descriptors if there are none
355                  * waiting:
356                  */
357                 if (TAILQ_EMPTY(&_workq) != 0) {
358                 }
359                 /*
360                  * Poll file descriptors only if a new scheduling signal
361                  * has occurred or if we have no more runnable threads.
362                  */
363                 else if (((current_tick = _sched_ticks) != last_tick) ||
364                     ((curthread->state != PS_RUNNING) &&
365                     (PTHREAD_PRIOQ_FIRST() == NULL))) {
366                         /* Unprotect the scheduling queues: */
367                         _queue_signals = 0;
368
369                         /*
370                          * Poll file descriptors to update the state of threads
371                          * waiting on file I/O where data may be available:
372                          */
373                         thread_kern_poll(0);
374
375                         /* Protect the scheduling queues: */
376                         _queue_signals = 1;
377                 }
378                 last_tick = current_tick;
379
380                 /*
381                  * Wake up threads that have timed out.  This has to be
382                  * done after polling in case a thread does a poll or
383                  * select with zero time.
384                  */
385                 PTHREAD_WAITQ_SETACTIVE();
386                 while (((pthread = TAILQ_FIRST(&_waitingq)) != NULL) &&
387                     (pthread->wakeup_time.tv_sec != -1) &&
388                     (((pthread->wakeup_time.tv_sec == 0) &&
389                     (pthread->wakeup_time.tv_nsec == 0)) ||
390                     (pthread->wakeup_time.tv_sec < ts.tv_sec) ||
391                     ((pthread->wakeup_time.tv_sec == ts.tv_sec) &&
392                     (pthread->wakeup_time.tv_nsec <= ts.tv_nsec)))) {
393                         switch (pthread->state) {
394                         case PS_POLL_WAIT:
395                         case PS_SELECT_WAIT:
396                                 /* Return zero file descriptors ready: */
397                                 pthread->data.poll_data->nfds = 0;
398                                 /* FALLTHROUGH */
399                         default:
400                                 /*
401                                  * Remove this thread from the waiting queue
402                                  * (and work queue if necessary) and place it
403                                  * in the ready queue.
404                                  */
405                                 PTHREAD_WAITQ_CLEARACTIVE();
406                                 if (pthread->flags & PTHREAD_FLAGS_IN_WORKQ)
407                                         PTHREAD_WORKQ_REMOVE(pthread);
408                                 PTHREAD_NEW_STATE(pthread, PS_RUNNING);
409                                 PTHREAD_WAITQ_SETACTIVE();
410                                 break;
411                         }
412                         /*
413                          * Flag the timeout in the thread structure:
414                          */
415                         pthread->timeout = 1;
416                 }
417                 PTHREAD_WAITQ_CLEARACTIVE();
418
419                 /*
420                  * Check to see if the current thread needs to be added
421                  * to the priority queue:
422                  */
423                 if (add_to_prioq != 0) {
424                         /*
425                          * Save the current time as the time that the
426                          * thread became inactive:
427                          */
428                         current_tick = _sched_ticks;
429                         curthread->last_inactive = (long)current_tick;
430                         if (curthread->last_inactive <
431                             curthread->last_active) {
432                                 /* Account for a rollover: */
433                                 curthread->last_inactive += UINT_MAX + 1;
434                         }
435
436                         if ((curthread->slice_usec != -1) &&
437                            (curthread->attr.sched_policy != SCHED_FIFO)) {
438                                 /*
439                                  * Accumulate the number of microseconds for
440                                  * which the current thread has run:
441                                  */
442                                 curthread->slice_usec +=
443                                     (curthread->last_inactive -
444                                     curthread->last_active) *
445                                     (long)_clock_res_usec;
446                                 /* Check for time quantum exceeded: */
447                                 if (curthread->slice_usec > TIMESLICE_USEC)
448                                         curthread->slice_usec = -1;
449                         }
450
451                         if (curthread->slice_usec == -1) {
452                                 /*
453                                  * The thread exceeded its time
454                                  * quantum or it yielded the CPU;
455                                  * place it at the tail of the
456                                  * queue for its priority.
457                                  */
458                                 PTHREAD_PRIOQ_INSERT_TAIL(curthread);
459                         } else {
460                                 /*
461                                  * The thread hasn't exceeded its
462                                  * interval.  Place it at the head
463                                  * of the queue for its priority.
464                                  */
465                                 PTHREAD_PRIOQ_INSERT_HEAD(curthread);
466                         }
467                 }
468
469                 /*
470                  * Get the highest priority thread in the ready queue.
471                  */
472                 pthread_h = PTHREAD_PRIOQ_FIRST();
473
474                 /* Check if there are no threads ready to run: */
475                 if (pthread_h == NULL) {
476                         /*
477                          * Lock the pthread kernel by changing the pointer to
478                          * the running thread to point to the global kernel
479                          * thread structure:
480                          */
481                         _set_curthread(&_thread_kern_thread);
482                         curthread = &_thread_kern_thread;
483
484                         DBG_MSG("No runnable threads, using kernel thread %p\n",
485                             curthread);
486
487                         /* Unprotect the scheduling queues: */
488                         _queue_signals = 0;
489
490                         /*
491                          * There are no threads ready to run, so wait until
492                          * something happens that changes this condition:
493                          */
494                         thread_kern_poll(1);
495
496                         /*
497                          * This process' usage will likely be very small
498                          * while waiting in a poll.  Since the scheduling
499                          * clock is based on the profiling timer, it is
500                          * unlikely that the profiling timer will fire
501                          * and update the time of day.  To account for this,
502                          * get the time of day after polling with a timeout.
503                          */
504                         gettimeofday((struct timeval *) &_sched_tod, NULL);
505                         
506                         /* Check once more for a runnable thread: */
507                         _queue_signals = 1;
508                         pthread_h = PTHREAD_PRIOQ_FIRST();
509                         _queue_signals = 0;
510                 }
511
512                 if (pthread_h != NULL) {
513                         /* Remove the thread from the ready queue: */
514                         PTHREAD_PRIOQ_REMOVE(pthread_h);
515
516                         /* Unprotect the scheduling queues: */
517                         _queue_signals = 0;
518
519                         /*
520                          * Check for signals queued while the scheduling
521                          * queues were protected:
522                          */
523                         while (_sigq_check_reqd != 0) {
524                                 /* Clear before handling queued signals: */
525                                 _sigq_check_reqd = 0;
526
527                                 /* Protect the scheduling queues again: */
528                                 _queue_signals = 1;
529
530                                 dequeue_signals();
531
532                                 /*
533                                  * Check for a higher priority thread that
534                                  * became runnable due to signal handling.
535                                  */
536                                 if (((pthread = PTHREAD_PRIOQ_FIRST()) != NULL) &&
537                                     (pthread->active_priority > pthread_h->active_priority)) {
538                                         /* Remove the thread from the ready queue: */
539                                         PTHREAD_PRIOQ_REMOVE(pthread);
540
541                                         /*
542                                          * Insert the lower priority thread
543                                          * at the head of its priority list:
544                                          */
545                                         PTHREAD_PRIOQ_INSERT_HEAD(pthread_h);
546
547                                         /* There's a new thread in town: */
548                                         pthread_h = pthread;
549                                 }
550
551                                 /* Unprotect the scheduling queues: */
552                                 _queue_signals = 0;
553                         }
554
555                         /* Make the selected thread the current thread: */
556                         _set_curthread(pthread_h);
557                         curthread = pthread_h;
558
559                         /*
560                          * Save the current time as the time that the thread
561                          * became active:
562                          */
563                         current_tick = _sched_ticks;
564                         curthread->last_active = (long) current_tick;
565
566                         /*
567                          * Check if this thread is running for the first time
568                          * or running again after using its full time slice
569                          * allocation:
570                          */
571                         if (curthread->slice_usec == -1) {
572                                 /* Reset the accumulated time slice period: */
573                                 curthread->slice_usec = 0;
574                         }
575
576                         /*
577                          * If we had a context switch, run any
578                          * installed switch hooks.
579                          */
580                         if ((_sched_switch_hook != NULL) &&
581                             (_last_user_thread != curthread)) {
582                                 thread_run_switch_hook(_last_user_thread,
583                                     curthread);
584                         }
585                         /*
586                          * Continue the thread at its current frame:
587                          */
588 #if NOT_YET
589                         _setcontext(&curthread->ctx.uc);
590 #else
591                         ___longjmp(curthread->ctx.jb, 1);
592 #endif
593                         /* This point should not be reached. */
594                         PANIC("Thread has returned from sigreturn or longjmp");
595                 }
596         }
597
598         /* There are no more threads, so exit this process: */
599         exit(0);
600 }
601
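/*
 * Set a new state for the current thread, recording the caller's file
 * name and line number for debugging, and hand off to the scheduler.
 */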
602 void
603 _thread_kern_sched_state(enum pthread_state state, char *fname, int lineno)
604 {
605         struct pthread  *curthread = _get_curthread();
606
607         /*
608          * Flag the pthread kernel as executing scheduler code
609          * to prevent a scheduler signal from interrupting this
610          * execution and calling the scheduler again.
611          */
612         _thread_kern_in_sched = 1;
613
614         /*
615          * Prevent the signal handler from fiddling with this thread
616          * before its state is set and is placed into the proper queue.
617          * before its state is set and it is placed into the proper queue.
618         _queue_signals = 1;
619
620         /* Change the state of the current thread: */
621         curthread->state = state;
622         curthread->fname = fname;
623         curthread->lineno = lineno;
624
625         /* Schedule the next thread that is ready: */
626         _thread_kern_sched(NULL);
627 }
628
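/*
 * Like _thread_kern_sched_state(), but also releases the given spinlock.
 * The lock is dropped only after the new state is recorded and the
 * scheduling queues are protected from the signal handler.
 */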
629 void
630 _thread_kern_sched_state_unlock(enum pthread_state state,
631     spinlock_t *lock, char *fname, int lineno)
632 {
633         struct pthread  *curthread = _get_curthread();
634
635         /*
636          * Flag the pthread kernel as executing scheduler code
637          * to prevent a scheduler signal from interrupting this
638          * execution and calling the scheduler again.
639          */
640         _thread_kern_in_sched = 1;
641
642         /*
643          * Prevent the signal handler from fiddling with this thread
644          * before its state is set and it is placed into the proper
645          * queue(s).
646          */
647         _queue_signals = 1;
648
649         /* Change the state of the current thread: */
650         curthread->state = state;
651         curthread->fname = fname;
652         curthread->lineno = lineno;
653
654         _SPINUNLOCK(lock);
655
656         /* Schedule the next thread that is ready: */
657         _thread_kern_sched(NULL);
658 }
659
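/*
 * Poll the file descriptors that threads in the work queue are waiting
 * on.  A pollfd table is built from those threads (plus the thread
 * kernel pipe when a blocking wait is requested), __sys_poll() is
 * called, and any thread whose descriptors are ready, or whose awaited
 * spinlock has been released, is made runnable again.  If wait_reqd is
 * zero the poll never blocks; otherwise it may block until the nearest
 * thread timeout.
 */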
660 static void
661 thread_kern_poll(int wait_reqd)
662 {
663         int             count = 0;
664         int             i, found;
665         int             kern_pipe_added = 0;
666         int             nfds = 0;
667         int             timeout_ms = 0;
668         struct pthread  *pthread;
669         struct timespec ts;
670         struct timeval  tv;
671
672         /* Check if the caller wants to wait: */
673         if (wait_reqd == 0) {
674                 timeout_ms = 0;
675         }
676         else {
677                 /* Get the current time of day: */
678                 GET_CURRENT_TOD(tv);
679                 TIMEVAL_TO_TIMESPEC(&tv, &ts);
680
681                 _queue_signals = 1;
682                 pthread = TAILQ_FIRST(&_waitingq);
683                 _queue_signals = 0;
684
685                 if ((pthread == NULL) || (pthread->wakeup_time.tv_sec == -1)) {
686                         /*
687                          * Either there are no threads in the waiting queue,
688                          * or there are no threads that can timeout.
689                          */
690                         timeout_ms = INFTIM;
691                 }
692                 else if (pthread->wakeup_time.tv_sec - ts.tv_sec > 60000)
693                         /* Limit maximum timeout to prevent rollover. */
694                         timeout_ms = 60000;
695                 else {
696                         /*
697                          * Calculate the time left for the next thread to
698                          * timeout:
699                          */
700                         timeout_ms = ((pthread->wakeup_time.tv_sec - ts.tv_sec) *
701                             1000) + ((pthread->wakeup_time.tv_nsec - ts.tv_nsec) /
702                             1000000);
703                         /*
704                          * Don't allow negative timeouts:
705                          */
706                         if (timeout_ms < 0)
707                                 timeout_ms = 0;
708                 }
709         }
710                         
711         /* Protect the scheduling queues: */
712         _queue_signals = 1;
713
714         /*
715          * Check to see if the signal queue needs to be walked to look
716          * for threads awoken by a signal while in the scheduler.
717          */
718         if (_sigq_check_reqd != 0) {
719                 /* Reset flag before handling queued signals: */
720                 _sigq_check_reqd = 0;
721
722                 dequeue_signals();
723         }
724
725         /*
726          * Check for a thread that became runnable due to a signal:
727          */
728         if (PTHREAD_PRIOQ_FIRST() != NULL) {
729                 /*
730                  * Since there is at least one runnable thread,
731                  * disable the wait.
732                  */
733                 timeout_ms = 0;
734         }
735
736         /*
737          * Form the poll table:
738          */
739         nfds = 0;
740         if (timeout_ms != 0) {
741                 /* Add the kernel pipe to the poll table: */
742                 _thread_pfd_table[nfds].fd = _thread_kern_pipe[0];
743                 _thread_pfd_table[nfds].events = POLLRDNORM;
744                 _thread_pfd_table[nfds].revents = 0;
745                 nfds++;
746                 kern_pipe_added = 1;
747         }
748
749         PTHREAD_WAITQ_SETACTIVE();
750         TAILQ_FOREACH(pthread, &_workq, qe) {
751                 switch (pthread->state) {
752                 case PS_SPINBLOCK:
753                         /*
754                          * If the lock is available, let the thread run.
755                          */
756                         if (pthread->data.spinlock->access_lock == 0) {
757                                 PTHREAD_WAITQ_CLEARACTIVE();
758                                 PTHREAD_WORKQ_REMOVE(pthread);
759                                 PTHREAD_NEW_STATE(pthread,PS_RUNNING);
760                                 PTHREAD_WAITQ_SETACTIVE();
761                                 /* One less thread in a spinblock state: */
762                                 _spinblock_count--;
763                                 /*
764                                  * Since there is at least one runnable
765                                  * thread, disable the wait.
766                                  */
767                                 timeout_ms = 0;
768                         }
769                         break;
770
771                 /* File descriptor read wait: */
772                 case PS_FDR_WAIT:
773                         /* Limit number of polled files to table size: */
774                         if (nfds < _thread_dtablesize) {
775                                 _thread_pfd_table[nfds].events = POLLRDNORM;
776                                 _thread_pfd_table[nfds].fd = pthread->data.fd.fd;
777                                 nfds++;
778                         }
779                         break;
780
781                 /* File descriptor write wait: */
782                 case PS_FDW_WAIT:
783                         /* Limit number of polled files to table size: */
784                         if (nfds < _thread_dtablesize) {
785                                 _thread_pfd_table[nfds].events = POLLWRNORM;
786                                 _thread_pfd_table[nfds].fd = pthread->data.fd.fd;
787                                 nfds++;
788                         }
789                         break;
790
791                 /* File descriptor poll or select wait: */
792                 case PS_POLL_WAIT:
793                 case PS_SELECT_WAIT:
794                         /* Limit number of polled files to table size: */
795                         if (pthread->data.poll_data->nfds + nfds <
796                             _thread_dtablesize) {
797                                 for (i = 0; i < pthread->data.poll_data->nfds; i++) {
798                                         _thread_pfd_table[nfds + i].fd =
799                                             pthread->data.poll_data->fds[i].fd;
800                                         _thread_pfd_table[nfds + i].events =
801                                             pthread->data.poll_data->fds[i].events;
802                                 }
803                                 nfds += pthread->data.poll_data->nfds;
804                         }
805                         break;
806
807                 /* Other states do not depend on file I/O. */
808                 default:
809                         break;
810                 }
811         }
812         PTHREAD_WAITQ_CLEARACTIVE();
813
814         /*
815          * Wait for a file descriptor to be ready for read, write, or
816          * an exception, or a timeout to occur:
817          */
818         count = __sys_poll(_thread_pfd_table, nfds, timeout_ms);
819
820         if (kern_pipe_added != 0)
821                 /*
822                  * Remove the pthread kernel pipe file descriptor
823                  * from the pollfd table:
824                  */
825                 nfds = 1;
826         else
827                 nfds = 0;
828
829         /*
830          * Check if it is possible that there are bytes in the kernel
831          * read pipe waiting to be read:
832          */
833         if (count < 0 || ((kern_pipe_added != 0) &&
834             (_thread_pfd_table[0].revents & POLLRDNORM))) {
835                 /*
836                  * If the kernel read pipe was included in the
837                  * count:
838                  */
839                 if (count > 0) {
840                         /* Decrement the count of file descriptors: */
841                         count--;
842                 }
843
844                 if (_sigq_check_reqd != 0) {
845                         /* Reset flag before handling signals: */
846                         _sigq_check_reqd = 0;
847
848                         dequeue_signals();
849                 }
850         }
851
852         /*
853          * Check if any file descriptors are ready:
854          */
855         if (count > 0) {
856                 /*
857                  * Enter a loop to look for threads waiting on file
858                  * descriptors that are flagged as available by the
859                  * _poll syscall:
860                  */
861                 PTHREAD_WAITQ_SETACTIVE();
862                 TAILQ_FOREACH(pthread, &_workq, qe) {
863                         switch (pthread->state) {
864                         case PS_SPINBLOCK:
865                                 /*
866                                  * If the lock is available, let the thread run.
867                                  */
868                                 if (pthread->data.spinlock->access_lock == 0) {
869                                         PTHREAD_WAITQ_CLEARACTIVE();
870                                         PTHREAD_WORKQ_REMOVE(pthread);
871                                         PTHREAD_NEW_STATE(pthread,PS_RUNNING);
872                                         PTHREAD_WAITQ_SETACTIVE();
873
874                                         /*
875                                          * One less thread in a spinblock state:
876                                          */
877                                         _spinblock_count--;
878                                 }
879                                 break;
880
881                         /* File descriptor read wait: */
882                         case PS_FDR_WAIT:
883                                 if ((nfds < _thread_dtablesize) &&
884                                     ((_thread_pfd_table[nfds].revents
885                                     & (POLLRDNORM | POLLHUP
886                                       | POLLERR | POLLNVAL)) != 0)) {
887                                         PTHREAD_WAITQ_CLEARACTIVE();
888                                         PTHREAD_WORKQ_REMOVE(pthread);
889                                         PTHREAD_NEW_STATE(pthread,PS_RUNNING);
890                                         PTHREAD_WAITQ_SETACTIVE();
891                                 }
892                                 nfds++;
893                                 break;
894
895                         /* File descriptor write wait: */
896                         case PS_FDW_WAIT:
897                                 if ((nfds < _thread_dtablesize) &&
898                                     ((_thread_pfd_table[nfds].revents
899                                     & (POLLWRNORM | POLLHUP
900                                       | POLLERR | POLLNVAL)) != 0)) {
901                                         PTHREAD_WAITQ_CLEARACTIVE();
902                                         PTHREAD_WORKQ_REMOVE(pthread);
903                                         PTHREAD_NEW_STATE(pthread,PS_RUNNING);
904                                         PTHREAD_WAITQ_SETACTIVE();
905                                 }
906                                 nfds++;
907                                 break;
908
909                         /* File descriptor poll or select wait: */
910                         case PS_POLL_WAIT:
911                         case PS_SELECT_WAIT:
912                                 if (pthread->data.poll_data->nfds + nfds <
913                                     _thread_dtablesize) {
914                                         /*
915                                          * Enter a loop looking for I/O
916                                          * readiness:
917                                          */
918                                         found = 0;
919                                         for (i = 0; i < pthread->data.poll_data->nfds; i++) {
920                                                 if (_thread_pfd_table[nfds + i].revents != 0) {
921                                                         pthread->data.poll_data->fds[i].revents =
922                                                             _thread_pfd_table[nfds + i].revents;
923                                                         found++;
924                                                 }
925                                         }
926
927                                         /* Advance nfds before poll_data->nfds is overwritten below: */
928                                         nfds += pthread->data.poll_data->nfds;
929
930                                         if (found != 0) {
931                                                 pthread->data.poll_data->nfds = found;
932                                                 PTHREAD_WAITQ_CLEARACTIVE();
933                                                 PTHREAD_WORKQ_REMOVE(pthread);
934                                                 PTHREAD_NEW_STATE(pthread,PS_RUNNING);
935                                                 PTHREAD_WAITQ_SETACTIVE();
936                                         }
937                                 }
938                                 else
939                                         nfds += pthread->data.poll_data->nfds;
940                                 break;
941
942                         /* Other states do not depend on file I/O. */
943                         default:
944                                 break;
945                         }
946                 }
947                 PTHREAD_WAITQ_CLEARACTIVE();
948         }
949         else if (_spinblock_count != 0) {
950                 /*
951                  * Enter a loop to look for threads waiting on a spinlock
952                  * that is now available.
953                  */
954                 PTHREAD_WAITQ_SETACTIVE();
955                 TAILQ_FOREACH(pthread, &_workq, qe) {
956                         if (pthread->state == PS_SPINBLOCK) {
957                                 /*
958                                  * If the lock is available, let the thread run.
959                                  */
960                                 if (pthread->data.spinlock->access_lock == 0) {
961                                         PTHREAD_WAITQ_CLEARACTIVE();
962                                         PTHREAD_WORKQ_REMOVE(pthread);
963                                         PTHREAD_NEW_STATE(pthread,PS_RUNNING);
964                                         PTHREAD_WAITQ_SETACTIVE();
965
966                                         /*
967                                          * One less thread in a spinblock state:
968                                          */
969                                         _spinblock_count--;
970                                 }
971                         }
972                 }
973                 PTHREAD_WAITQ_CLEARACTIVE();
974         }
975
976         /* Unprotect the scheduling queues: */
977         _queue_signals = 0;
978
979         while (_sigq_check_reqd != 0) {
980                 /* Handle queued signals: */
981                 _sigq_check_reqd = 0;
982
983                 /* Protect the scheduling queues: */
984                 _queue_signals = 1;
985
986                 dequeue_signals();
987
988                 /* Unprotect the scheduling queues: */
989                 _queue_signals = 0;
990         }
991 }
992
993 void
994 _thread_kern_set_timeout(const struct timespec * timeout)
995 {
996         struct pthread  *curthread = _get_curthread();
997         struct timespec current_time;
998         struct timeval  tv;
999
1000         /* Reset the timeout flag for the running thread: */
1001         curthread->timeout = 0;
1002
1003         /* Check if the thread is to wait forever: */
1004         if (timeout == NULL) {
1005                 /*
1006                  * Set the wakeup time to something that can be recognised as
1007                  * different to an actual time of day:
1008                  */
1009                 curthread->wakeup_time.tv_sec = -1;
1010                 curthread->wakeup_time.tv_nsec = -1;
1011         }
1012         /* Check if no waiting is required: */
1013         else if (timeout->tv_sec == 0 && timeout->tv_nsec == 0) {
1014                 /* Set the wake up time to 'immediately': */
1015                 curthread->wakeup_time.tv_sec = 0;
1016                 curthread->wakeup_time.tv_nsec = 0;
1017         } else {
1018                 /* Get the current time: */
1019                 GET_CURRENT_TOD(tv);
1020                 TIMEVAL_TO_TIMESPEC(&tv, &current_time);
1021
1022                 /* Calculate the time for the current thread to wake up: */
1023                 curthread->wakeup_time.tv_sec = current_time.tv_sec + timeout->tv_sec;
1024                 curthread->wakeup_time.tv_nsec = current_time.tv_nsec + timeout->tv_nsec;
1025
1026                 /* Check if the nanosecond field needs to wrap: */
1027                 if (curthread->wakeup_time.tv_nsec >= 1000000000) {
1028                         /* Wrap the nanosecond field: */
1029                         curthread->wakeup_time.tv_sec += 1;
1030                         curthread->wakeup_time.tv_nsec -= 1000000000;
1031                 }
1032         }
1033 }
1034
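/*
 * _thread_kern_sig_defer() and _thread_kern_sig_undefer() bracket code
 * that must not be preempted by signal handling.  Deferral nests via
 * sig_defer_count; the final undefer handles any queued signals, checks
 * for asynchronous cancellation, and yields the CPU if that was
 * requested while signals were deferred.
 */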
1035 void
1036 _thread_kern_sig_defer(void)
1037 {
1038         struct pthread  *curthread = _get_curthread();
1039
1040         /* Allow signal deferral to be recursive. */
1041         curthread->sig_defer_count++;
1042 }
1043
1044 void
1045 _thread_kern_sig_undefer(void)
1046 {
1047         struct pthread  *curthread = _get_curthread();
1048
1049         /*
1050          * Perform checks to yield only if we are about to undefer
1051          * signals.
1052          */
1053         if (curthread->sig_defer_count > 1) {
1054                 /* Decrement the signal deferral count. */
1055                 curthread->sig_defer_count--;
1056         }
1057         else if (curthread->sig_defer_count == 1) {
1058                 /* Reenable signals: */
1059                 curthread->sig_defer_count = 0;
1060
1061                 /*
1062                  * Check if there are queued signals:
1063                  */
1064                 if (_sigq_check_reqd != 0)
1065                         _thread_kern_sched(NULL);
1066
1067                 /*
1068                  * Check for asynchronous cancellation before delivering any
1069                  * pending signals:
1070                  */
1071                 if (((curthread->cancelflags & PTHREAD_AT_CANCEL_POINT) == 0) &&
1072                     ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
1073                         pthread_testcancel();
1074
1075                 /*
1076                  * If there are pending signals or this thread has
1077                  * to yield the CPU, call the kernel scheduler:
1078                  *
1079                  * XXX - Come back and revisit the pending signal problem
1080                  */
1081                 if ((curthread->yield_on_sig_undefer != 0) ||
1082                     SIGNOTEMPTY(curthread->sigpend)) {
1083                         curthread->yield_on_sig_undefer = 0;
1084                         _thread_kern_sched(NULL);
1085                 }
1086         }
1087 }
1088
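/*
 * Drain the thread kernel pipe, which is used to interrupt a blocked
 * poll when signals arrive, and then handle any pending signals.
 */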
1089 static void
1090 dequeue_signals(void)
1091 {
1092         char    bufr[128];
1093         int     num;
1094
1095         /*
1096          * Enter a loop to clear the pthread kernel pipe:
1097          */
1098         while (((num = __sys_read(_thread_kern_pipe[0], bufr,
1099             sizeof(bufr))) > 0) || (num == -1 && errno == EINTR)) {
1100         }
1101         if ((num < 0) && (errno != EAGAIN)) {
1102                 /*
1103                  * The only error we should expect is if there is
1104                  * no data to read.
1105                  */
1106                 PANIC("Unable to read from thread kernel pipe");
1107         }
1108         /* Handle any pending signals: */
1109         _thread_sig_handle_pending();
1110 }
1111
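/*
 * Run the user-installed switch hook.  Threads flagged
 * PTHREAD_FLAGS_PRIVATE are reported to the hook as NULL.
 */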
1112 static inline void
1113 thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in)
1114 {
1115         pthread_t tid_out = thread_out;
1116         pthread_t tid_in = thread_in;
1117
1118         if ((tid_out != NULL) &&
1119             (tid_out->flags & PTHREAD_FLAGS_PRIVATE) != 0)
1120                 tid_out = NULL;
1121         if ((tid_in != NULL) &&
1122             (tid_in->flags & PTHREAD_FLAGS_PRIVATE) != 0)
1123                 tid_in = NULL;
1124
1125         if ((_sched_switch_hook != NULL) && (tid_out != tid_in)) {
1126                 /* Run the scheduler switch hook: */
1127                 _sched_switch_hook(tid_out, tid_in);
1128         }
1129 }
1130
1131 struct pthread *
1132 _get_curthread(void)
1133 {
1134         if (_thread_initial == NULL)
1135                 _thread_init();
1136
1137         return (_thread_run);
1138 }
1139
1140 void
1141 _set_curthread(struct pthread *newthread)
1142 {
1143         _thread_run = newthread;
1144 }