bind - Upgraded vendor branch to 9.5.2-P1
[dragonfly.git] / contrib / bind-9.5.2 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2009  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: socket.c,v 1.275.10.42 2009/09/07 02:14:40 marka Exp $ */
19
20 /*! \file */
21
22 #include <config.h>
23
24 #include <sys/param.h>
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/stat.h>
28 #include <sys/time.h>
29 #include <sys/uio.h>
30
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <stddef.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37
38 #include <isc/buffer.h>
39 #include <isc/bufferlist.h>
40 #include <isc/condition.h>
41 #include <isc/formatcheck.h>
42 #include <isc/list.h>
43 #include <isc/log.h>
44 #include <isc/mem.h>
45 #include <isc/msgs.h>
46 #include <isc/mutex.h>
47 #include <isc/net.h>
48 #include <isc/once.h>
49 #include <isc/platform.h>
50 #include <isc/print.h>
51 #include <isc/region.h>
52 #include <isc/socket.h>
53 #include <isc/stats.h>
54 #include <isc/strerror.h>
55 #include <isc/task.h>
56 #include <isc/thread.h>
57 #include <isc/util.h>
58 #include <isc/xml.h>
59
60 #ifdef ISC_PLATFORM_HAVESYSUNH
61 #include <sys/un.h>
62 #endif
63 #ifdef ISC_PLATFORM_HAVEKQUEUE
64 #include <sys/event.h>
65 #endif
66 #ifdef ISC_PLATFORM_HAVEEPOLL
67 #include <sys/epoll.h>
68 #endif
69 #ifdef ISC_PLATFORM_HAVEDEVPOLL
70 #include <sys/devpoll.h>
71 #endif
72
73 #include "errno2result.h"
74
75 #ifndef ISC_PLATFORM_USETHREADS
76 #include "socket_p.h"
77 #endif /* ISC_PLATFORM_USETHREADS */
78
79 #if defined(SO_BSDCOMPAT) && defined(__linux__)
80 #include <sys/utsname.h>
81 #endif
82
/*%
 * Pick exactly one I/O multiplexing backend.  The preference order is
 * kqueue, then epoll, then /dev/poll; select() is the fallback when none
 * of the platform macros is defined.
 */
#if defined(ISC_PLATFORM_HAVEKQUEUE)
#define USE_KQUEUE
#elif defined(ISC_PLATFORM_HAVEEPOLL)
#define USE_EPOLL
#elif defined(ISC_PLATFORM_HAVEDEVPOLL)
#define USE_DEVPOLL
/*% Per-descriptor flags recording which directions are polled. */
typedef struct {
	unsigned int want_read : 1;
	unsigned int want_write : 1;
} pollinfo_t;
#else
#define USE_SELECT
#endif	/* multiplex backend selection */
99
/*%
 * Wait context used only in non-threaded builds.  The event-based backends
 * (kqueue, epoll, /dev/poll) carry just an event count; the select()
 * backend carries the descriptor sets and their bounds.
 */
#if !defined(ISC_PLATFORM_USETHREADS) && \
    (defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
     defined(USE_SELECT))
struct isc_socketwait {
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
	int nevents;
#else	/* USE_SELECT */
	fd_set *readset;
	fd_set *writeset;
	int nfds;
	int maxfd;
#endif
};
#endif /* !ISC_PLATFORM_USETHREADS */
114
115 /*%
116  * Maximum number of allowable open sockets.  This is also the maximum
117  * allowable socket file descriptor.
118  *
119  * Care should be taken before modifying this value for select():
120  * The API standard doesn't ensure select() accepts more than (the system default
121  * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
122  * the vast majority of cases.  This constant should therefore be increased only
123  * when absolutely necessary and possible, i.e., the server is exhausting all
124  * available file descriptors (up to FD_SETSIZE) and the select() function
125  * and FD_xxx macros support larger values than FD_SETSIZE (which may not
126  * always be true, but we keep using some of them to ensure as much
127  * portability as possible).  Note also that overall server performance
128  * may be rather worsened with a larger value of this constant due to
129  * inherent scalability problems of select().
130  *
131  * As a special note, this value shouldn't have to be touched if
132  * this is a build for an authoritative only DNS server.
133  */
#if !defined(ISC_SOCKET_MAXSOCKETS)
# if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#  define ISC_SOCKET_MAXSOCKETS 4096
# elif defined(USE_SELECT)
#  define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
# endif	/* USE_KQUEUE... */
#endif	/* ISC_SOCKET_MAXSOCKETS */
141
#if defined(USE_SELECT) && defined(__APPLE__)
/*%
 * Mac OS X needs a special definition before select() will support larger
 * descriptor values.  It is defined unconditionally for the select()
 * backend because a larger value can be specified at run time.
 */
#define _DARWIN_UNLIMITED_SELECT
#endif	/* USE_SELECT && __APPLE__ */
151
#if defined(ISC_SOCKET_USE_POLLWATCH)
/*%
 * Workaround for a Solaris /dev/poll kernel bug in which the DP_POLL ioctl
 * may keep sleeping even though socket I/O is possible on one of the
 * watched descriptors.  It relies on the observation that a busy server
 * tends to keep receiving packets:
 *
 * - The watcher starts in "poll_idle" and sleeps until a socket event
 *   occurs.
 * - Waking up for a socket I/O event moves it to "poll_active" and
 *   shortens the poll timeout to ISC_SOCKET_POLLWATCH_TIMEOUT msec.
 * - A timeout while active moves it to "poll_checking", keeping the same
 *   short timeout, to tell a lull between intermittent events apart from
 *   the kernel bug.
 * - If the next poll reports an event within the short period, the earlier
 *   timeout was probably the kernel bug, so the watcher returns to
 *   "poll_active"; otherwise it falls back to "poll_idle".
 *
 * It is unclear whether the bug is thread related, but it has only been
 * observed with threads, so the workaround is used only when threads are
 * enabled.
 */

typedef enum {
	poll_idle,
	poll_active,
	poll_checking
} pollstate_t;

#if !defined(ISC_SOCKET_POLLWATCH_TIMEOUT)
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif	/* ISC_SOCKET_POLLWATCH_TIMEOUT */
#endif	/* ISC_SOCKET_USE_POLLWATCH */
180
/*%
 * Number of per-FD lock buckets and the mapping from a descriptor to its
 * bucket.  Without threads a single bucket is used for every descriptor.
 */
#if !defined(ISC_PLATFORM_USETHREADS)
#define FDLOCK_COUNT		1
#define FDLOCK_ID(fd)		0
#else	/* ISC_PLATFORM_USETHREADS */
#define FDLOCK_COUNT		1024
#define FDLOCK_ID(fd)		((fd) % FDLOCK_COUNT)
#endif	/* !ISC_PLATFORM_USETHREADS */
191
/*%
 * Upper bound on the number of events exchanged with the kernel in one
 * call.  A large number should normally not be needed.  Only meaningful
 * for the event-based backends; an externally supplied value wins.
 */
#if (defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)) && \
    !defined(ISC_SOCKET_MAXEVENTS)
#define ISC_SOCKET_MAXEVENTS	64
#endif
201
/*%
 * The type of the socket length argument differs between systems (int,
 * size_t or socklen_t).  It is kept behind this macro so that it can be
 * overridden easily when needed.
 */
#if !defined(ISC_SOCKADDR_LEN_T)
#define ISC_SOCKADDR_LEN_T unsigned int
#endif
209
/*%
 * Test whether an errno value is a "soft" error, i.e. a non-fatal result of
 * a network call such as recv().
 *
 * BSDI (and perhaps other systems) sometimes returns <0 from recv() while
 * leaving errno==0.  That is broken, but it is worked around here by
 * counting 0 as a soft error too.
 */
#define SOFT_ERROR(e)	(!((e) != EAGAIN && \
			   (e) != EWOULDBLOCK && \
			   (e) != EINTR && \
			   (e) != 0))
222
/*!
 * DLVL(x) expands to the category, module and debug-level arguments used
 * when logging from this file.  The levels in use are:
 *
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

#define TRACE_LEVEL		90
#define TRACE			DLVL(TRACE_LEVEL)

#define CORRECTNESS_LEVEL	70
#define CORRECTNESS		DLVL(CORRECTNESS_LEVEL)

#define IOEVENT_LEVEL		60
#define IOEVENT			DLVL(IOEVENT_LEVEL)

#define EVENT_LEVEL		50
#define EVENT			DLVL(EVENT_LEVEL)

#define CREATION_LEVEL		20
#define CREATION		DLVL(CREATION_LEVEL)
243
244 typedef isc_event_t intev_t;
245
246 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
247 #define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
248
/*!
 * Control-message (cmsg) support is enabled for either of two reasons:
 *
 * - IPv6 control information: on an IPv6 socket we want to collect the
 *   destination address and interface so the client can set them on
 *   outgoing packets (ISC_PLATFORM_HAVEIN6PKTINFO).
 *
 * - Packet timestamps: NetBSD and FreeBSD can timestamp packets
 *   (SO_TIMESTAMP).  XXXMLG Should we have a setsockopt() like interface
 *   to request timestamps, and if the OS doesn't do it for us, call
 *   gettimeofday() on every UDP receive?
 *
 * A USE_CMSG value supplied from outside is left untouched.
 */
#if !defined(USE_CMSG) && \
    (defined(ISC_PLATFORM_HAVEIN6PKTINFO) || defined(SO_TIMESTAMP))
#define USE_CMSG	1
#endif
270
/*%
 * Tunable constants:
 *
 * RCVBUFSIZE  size to raise the receive buffer to (from BIND 8).
 * NRETRIES    number of times a send operation is repeated if the
 *             result is EINTR.
 */
#define RCVBUFSIZE	(32 * 1024)
#define NRETRIES	10
280
281 struct isc_socket {
282         /* Not locked. */
283         unsigned int            magic;
284         isc_socketmgr_t        *manager;
285         isc_mutex_t             lock;
286         isc_sockettype_t        type;
287         const isc_statscounter_t        *statsindex;
288
289         /* Locked by socket lock. */
290         ISC_LINK(isc_socket_t)  link;
291         unsigned int            references;
292         int                     fd;
293         int                     pf;
294         char                            name[16];
295         void *                          tag;
296
297         ISC_LIST(isc_socketevent_t)             send_list;
298         ISC_LIST(isc_socketevent_t)             recv_list;
299         ISC_LIST(isc_socket_newconnev_t)        accept_list;
300         isc_socket_connev_t                    *connect_ev;
301
302         /*
303          * Internal events.  Posted when a descriptor is readable or
304          * writable.  These are statically allocated and never freed.
305          * They will be set to non-purgable before use.
306          */
307         intev_t                 readable_ev;
308         intev_t                 writable_ev;
309
310         isc_sockaddr_t          peer_address;  /* remote address */
311
312         unsigned int            pending_recv : 1,
313                                 pending_send : 1,
314                                 pending_accept : 1,
315                                 listener : 1, /* listener socket */
316                                 connected : 1,
317                                 connecting : 1, /* connect pending */
318                                 bound : 1; /* bound to local addr */
319
320 #ifdef ISC_NET_RECVOVERFLOW
321         unsigned char           overflow; /* used for MSG_TRUNC fake */
322 #endif
323
324         char                    *recvcmsgbuf;
325         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
326         char                    *sendcmsgbuf;
327         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
328
329         void                    *fdwatcharg;
330         isc_sockfdwatch_t       fdwatchcb;
331         int                     fdwatchflags;
332         isc_task_t              *fdwatchtask;
333 };
334
335 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
336 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
337
338 struct isc_socketmgr {
339         /* Not locked. */
340         unsigned int            magic;
341         isc_mem_t              *mctx;
342         isc_mutex_t             lock;
343         isc_mutex_t             *fdlock;
344         isc_stats_t             *stats;
345 #ifdef USE_KQUEUE
346         int                     kqueue_fd;
347         int                     nevents;
348         struct kevent           *events;
349 #endif  /* USE_KQUEUE */
350 #ifdef USE_EPOLL
351         int                     epoll_fd;
352         int                     nevents;
353         struct epoll_event      *events;
354 #endif  /* USE_EPOLL */
355 #ifdef USE_DEVPOLL
356         int                     devpoll_fd;
357         int                     nevents;
358         struct pollfd           *events;
359 #endif  /* USE_DEVPOLL */
360 #ifdef USE_SELECT
361         int                     fd_bufsize;
362 #endif  /* USE_SELECT */
363         unsigned int            maxsocks;
364 #ifdef ISC_PLATFORM_USETHREADS
365         int                     pipe_fds[2];
366 #endif
367
368         /* Locked by fdlock. */
369         isc_socket_t           **fds;
370         int                     *fdstate;
371 #ifdef USE_DEVPOLL
372         pollinfo_t              *fdpollinfo;
373 #endif
374
375         /* Locked by manager lock. */
376         ISC_LIST(isc_socket_t)  socklist;
377 #ifdef USE_SELECT
378         fd_set                  *read_fds;
379         fd_set                  *read_fds_copy;
380         fd_set                  *write_fds;
381         fd_set                  *write_fds_copy;
382         int                     maxfd;
383 #endif  /* USE_SELECT */
384         int                     reserved;       /* unlocked */
385 #ifdef ISC_PLATFORM_USETHREADS
386         isc_thread_t            watcher;
387         isc_condition_t         shutdown_ok;
388 #else /* ISC_PLATFORM_USETHREADS */
389         unsigned int            refs;
390 #endif /* ISC_PLATFORM_USETHREADS */
391 };
392
393 #ifndef ISC_PLATFORM_USETHREADS
394 static isc_socketmgr_t *socketmgr = NULL;
395 #endif /* ISC_PLATFORM_USETHREADS */
396
/*%
 * Per-descriptor states kept in the manager's fdstate[] array.
 */
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1	/* eligible to be watched */
#define CLOSE_PENDING		2	/* close requested, not yet done */
400
/*
 * iovec counts for send() and recv().  When ISC_NET_RECVOVERFLOW is
 * defined the receive side gets one extra iovec.
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#if defined(ISC_NET_RECVOVERFLOW)
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif
410
411 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
412 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
413 static void free_socket(isc_socket_t **);
414 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
415                                     isc_socket_t **);
416 static void destroy(isc_socket_t **);
417 static void internal_accept(isc_task_t *, isc_event_t *);
418 static void internal_connect(isc_task_t *, isc_event_t *);
419 static void internal_recv(isc_task_t *, isc_event_t *);
420 static void internal_send(isc_task_t *, isc_event_t *);
421 static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
422 static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
423 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
424 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
425                               struct msghdr *, struct iovec *, size_t *);
426 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
427                               struct msghdr *, struct iovec *, size_t *);
428 #ifdef ISC_PLATFORM_USETHREADS
429 static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager);
430 #endif
431
/*%
 * Message codes passed to the watcher.  ACCEPT shares its value with READ
 * and CONNECT shares its value with WRITE.
 */
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
#define SELECT_POKE_READ		(-3)
#define SELECT_POKE_ACCEPT		SELECT_POKE_READ
#define SELECT_POKE_WRITE		(-4)
#define SELECT_POKE_CONNECT		SELECT_POKE_WRITE
#define SELECT_POKE_CLOSE		(-5)

/*% True when socket 's' has no references left. */
#define SOCK_DEAD(s)			((s)->references == 0)
441
/*%
 * Positions within the per-socket-type statistics index arrays.  Each
 * array holds one counter ID per position, in exactly this order.
 */
enum {
	STATID_OPEN,		/* 0 */
	STATID_OPENFAIL,	/* 1 */
	STATID_CLOSE,		/* 2 */
	STATID_BINDFAIL,	/* 3 */
	STATID_CONNECTFAIL,	/* 4 */
	STATID_CONNECT,		/* 5 */
	STATID_ACCEPTFAIL,	/* 6 */
	STATID_ACCEPT,		/* 7 */
	STATID_SENDFAIL,	/* 8 */
	STATID_RECVFAIL		/* 9 */
};
457 static const isc_statscounter_t upd4statsindex[] = {
458         isc_sockstatscounter_udp4open,
459         isc_sockstatscounter_udp4openfail,
460         isc_sockstatscounter_udp4close,
461         isc_sockstatscounter_udp4bindfail,
462         isc_sockstatscounter_udp4connectfail,
463         isc_sockstatscounter_udp4connect,
464         -1,
465         -1,
466         isc_sockstatscounter_udp4sendfail,
467         isc_sockstatscounter_udp4recvfail
468 };
469 static const isc_statscounter_t upd6statsindex[] = {
470         isc_sockstatscounter_udp6open,
471         isc_sockstatscounter_udp6openfail,
472         isc_sockstatscounter_udp6close,
473         isc_sockstatscounter_udp6bindfail,
474         isc_sockstatscounter_udp6connectfail,
475         isc_sockstatscounter_udp6connect,
476         -1,
477         -1,
478         isc_sockstatscounter_udp6sendfail,
479         isc_sockstatscounter_udp6recvfail
480 };
481 static const isc_statscounter_t tcp4statsindex[] = {
482         isc_sockstatscounter_tcp4open,
483         isc_sockstatscounter_tcp4openfail,
484         isc_sockstatscounter_tcp4close,
485         isc_sockstatscounter_tcp4bindfail,
486         isc_sockstatscounter_tcp4connectfail,
487         isc_sockstatscounter_tcp4connect,
488         isc_sockstatscounter_tcp4acceptfail,
489         isc_sockstatscounter_tcp4accept,
490         isc_sockstatscounter_tcp4sendfail,
491         isc_sockstatscounter_tcp4recvfail
492 };
493 static const isc_statscounter_t tcp6statsindex[] = {
494         isc_sockstatscounter_tcp6open,
495         isc_sockstatscounter_tcp6openfail,
496         isc_sockstatscounter_tcp6close,
497         isc_sockstatscounter_tcp6bindfail,
498         isc_sockstatscounter_tcp6connectfail,
499         isc_sockstatscounter_tcp6connect,
500         isc_sockstatscounter_tcp6acceptfail,
501         isc_sockstatscounter_tcp6accept,
502         isc_sockstatscounter_tcp6sendfail,
503         isc_sockstatscounter_tcp6recvfail
504 };
505 static const isc_statscounter_t unixstatsindex[] = {
506         isc_sockstatscounter_unixopen,
507         isc_sockstatscounter_unixopenfail,
508         isc_sockstatscounter_unixclose,
509         isc_sockstatscounter_unixbindfail,
510         isc_sockstatscounter_unixconnectfail,
511         isc_sockstatscounter_unixconnect,
512         isc_sockstatscounter_unixacceptfail,
513         isc_sockstatscounter_unixaccept,
514         isc_sockstatscounter_unixsendfail,
515         isc_sockstatscounter_unixrecvfail
516 };
517 static const isc_statscounter_t fdwatchstatsindex[] = {
518         -1,
519         -1,
520         isc_sockstatscounter_fdwatchclose,
521         isc_sockstatscounter_fdwatchbindfail,
522         isc_sockstatscounter_fdwatchconnectfail,
523         isc_sockstatscounter_fdwatchconnect,
524         -1,
525         -1,
526         isc_sockstatscounter_fdwatchsendfail,
527         isc_sockstatscounter_fdwatchrecvfail
528 };
529
530 static void
531 manager_log(isc_socketmgr_t *sockmgr,
532             isc_logcategory_t *category, isc_logmodule_t *module, int level,
533             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
534 static void
535 manager_log(isc_socketmgr_t *sockmgr,
536             isc_logcategory_t *category, isc_logmodule_t *module, int level,
537             const char *fmt, ...)
538 {
539         char msgbuf[2048];
540         va_list ap;
541
542         if (! isc_log_wouldlog(isc_lctx, level))
543                 return;
544
545         va_start(ap, fmt);
546         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
547         va_end(ap);
548
549         isc_log_write(isc_lctx, category, module, level,
550                       "sockmgr %p: %s", sockmgr, msgbuf);
551 }
552
553 static void
554 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
555            isc_logcategory_t *category, isc_logmodule_t *module, int level,
556            isc_msgcat_t *msgcat, int msgset, int message,
557            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
558 static void
559 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
560            isc_logcategory_t *category, isc_logmodule_t *module, int level,
561            isc_msgcat_t *msgcat, int msgset, int message,
562            const char *fmt, ...)
563 {
564         char msgbuf[2048];
565         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
566         va_list ap;
567
568         if (! isc_log_wouldlog(isc_lctx, level))
569                 return;
570
571         va_start(ap, fmt);
572         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
573         va_end(ap);
574
575         if (address == NULL) {
576                 isc_log_iwrite(isc_lctx, category, module, level,
577                                msgcat, msgset, message,
578                                "socket %p: %s", sock, msgbuf);
579         } else {
580                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
581                 isc_log_iwrite(isc_lctx, category, module, level,
582                                msgcat, msgset, message,
583                                "socket %p %s: %s", sock, peerbuf, msgbuf);
584         }
585 }
586
587 #if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
588     defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
589 /*
590  * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
591  * setting IPV6_V6ONLY.
592  */
593 static void
594 FIX_IPV6_RECVPKTINFO(isc_socket_t *sock)
595 {
596         char strbuf[ISC_STRERRORSIZE];
597         int on = 1;
598
599         if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
600                 return;
601
602         if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
603                        (void *)&on, sizeof(on)) < 0) {
604
605                 UNEXPECTED_ERROR(__FILE__, __LINE__,
606                                  "setsockopt(%d, IPV6_RECVPKTINFO) "
607                                  "%s: %s", sock->fd,
608                                  isc_msgcat_get(isc_msgcat,
609                                                 ISC_MSGSET_GENERAL,
610                                                 ISC_MSG_FAILED,
611                                                 "failed"),
612                                  strbuf);
613         }
614 }
615 #else
616 #define FIX_IPV6_RECVPKTINFO(sock) (void)0
617 #endif
618
619 /*%
620  * Increment socket-related statistics counters.
621  */
622 static inline void
623 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
624         REQUIRE(counterid != -1);
625
626         if (stats != NULL)
627                 isc_stats_increment(stats, counterid);
628 }
629
630 static inline isc_result_t
631 watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
632         isc_result_t result = ISC_R_SUCCESS;
633
634 #ifdef USE_KQUEUE
635         struct kevent evchange;
636
637         memset(&evchange, 0, sizeof(evchange));
638         if (msg == SELECT_POKE_READ)
639                 evchange.filter = EVFILT_READ;
640         else
641                 evchange.filter = EVFILT_WRITE;
642         evchange.flags = EV_ADD;
643         evchange.ident = fd;
644         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
645                 result = isc__errno2result(errno);
646
647         return (result);
648 #elif defined(USE_EPOLL)
649         struct epoll_event event;
650
651         if (msg == SELECT_POKE_READ)
652                 event.events = EPOLLIN;
653         else
654                 event.events = EPOLLOUT;
655         event.data.fd = fd;
656         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
657             errno != EEXIST) {
658                 result = isc__errno2result(errno);
659         }
660
661         return (result);
662 #elif defined(USE_DEVPOLL)
663         struct pollfd pfd;
664         int lockid = FDLOCK_ID(fd);
665
666         memset(&pfd, 0, sizeof(pfd));
667         if (msg == SELECT_POKE_READ)
668                 pfd.events = POLLIN;
669         else
670                 pfd.events = POLLOUT;
671         pfd.fd = fd;
672         pfd.revents = 0;
673         LOCK(&manager->fdlock[lockid]);
674         if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
675                 result = isc__errno2result(errno);
676         else {
677                 if (msg == SELECT_POKE_READ)
678                         manager->fdpollinfo[fd].want_read = 1;
679                 else
680                         manager->fdpollinfo[fd].want_write = 1;
681         }
682         UNLOCK(&manager->fdlock[lockid]);
683
684         return (result);
685 #elif defined(USE_SELECT)
686         LOCK(&manager->lock);
687         if (msg == SELECT_POKE_READ)
688                 FD_SET(fd, manager->read_fds);
689         if (msg == SELECT_POKE_WRITE)
690                 FD_SET(fd, manager->write_fds);
691         UNLOCK(&manager->lock);
692
693         return (result);
694 #endif
695 }
696
697 static inline isc_result_t
698 unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
699         isc_result_t result = ISC_R_SUCCESS;
700
701 #ifdef USE_KQUEUE
702         struct kevent evchange;
703
704         memset(&evchange, 0, sizeof(evchange));
705         if (msg == SELECT_POKE_READ)
706                 evchange.filter = EVFILT_READ;
707         else
708                 evchange.filter = EVFILT_WRITE;
709         evchange.flags = EV_DELETE;
710         evchange.ident = fd;
711         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
712                 result = isc__errno2result(errno);
713
714         return (result);
715 #elif defined(USE_EPOLL)
716         struct epoll_event event;
717
718         if (msg == SELECT_POKE_READ)
719                 event.events = EPOLLIN;
720         else
721                 event.events = EPOLLOUT;
722         event.data.fd = fd;
723         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
724             errno != ENOENT) {
725                 char strbuf[ISC_STRERRORSIZE];
726                 isc__strerror(errno, strbuf, sizeof(strbuf));
727                 UNEXPECTED_ERROR(__FILE__, __LINE__,
728                                  "epoll_ctl(DEL), %d: %s", fd, strbuf);
729                 result = ISC_R_UNEXPECTED;
730         }
731         return (result);
732 #elif defined(USE_DEVPOLL)
733         struct pollfd pfds[2];
734         size_t writelen = sizeof(pfds[0]);
735         int lockid = FDLOCK_ID(fd);
736
737         memset(pfds, 0, sizeof(pfds));
738         pfds[0].events = POLLREMOVE;
739         pfds[0].fd = fd;
740
741         /*
742          * Canceling read or write polling via /dev/poll is tricky.  Since it
743          * only provides a way of canceling per FD, we may need to re-poll the
744          * socket for the other operation.
745          */
746         LOCK(&manager->fdlock[lockid]);
747         if (msg == SELECT_POKE_READ &&
748             manager->fdpollinfo[fd].want_write == 1) {
749                 pfds[1].events = POLLOUT;
750                 pfds[1].fd = fd;
751                 writelen += sizeof(pfds[1]);
752         }
753         if (msg == SELECT_POKE_WRITE &&
754             manager->fdpollinfo[fd].want_read == 1) {
755                 pfds[1].events = POLLIN;
756                 pfds[1].fd = fd;
757                 writelen += sizeof(pfds[1]);
758         }
759
760         if (write(manager->devpoll_fd, pfds, writelen) == -1)
761                 result = isc__errno2result(errno);
762         else {
763                 if (msg == SELECT_POKE_READ)
764                         manager->fdpollinfo[fd].want_read = 0;
765                 else
766                         manager->fdpollinfo[fd].want_write = 0;
767         }
768         UNLOCK(&manager->fdlock[lockid]);
769
770         return (result);
771 #elif defined(USE_SELECT)
772         LOCK(&manager->lock);
773         if (msg == SELECT_POKE_READ)
774                 FD_CLR(fd, manager->read_fds);
775         else if (msg == SELECT_POKE_WRITE)
776                 FD_CLR(fd, manager->write_fds);
777         UNLOCK(&manager->lock);
778
779         return (result);
780 #endif
781 }
782
783 static void
784 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
785         isc_result_t result;
786         int lockid = FDLOCK_ID(fd);
787
788         /*
789          * This is a wakeup on a socket.  If the socket is not in the
790          * process of being closed, start watching it for either reads
791          * or writes.
792          */
793
794         INSIST(fd >= 0 && fd < (int)manager->maxsocks);
795
796         if (msg == SELECT_POKE_CLOSE) {
797                 /* No one should be updating fdstate, so no need to lock it */
798                 INSIST(manager->fdstate[fd] == CLOSE_PENDING);
799                 manager->fdstate[fd] = CLOSED;
800                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
801                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
802                 (void)close(fd);
803                 return;
804         }
805
806         LOCK(&manager->fdlock[lockid]);
807         if (manager->fdstate[fd] == CLOSE_PENDING) {
808                 UNLOCK(&manager->fdlock[lockid]);
809
810                 /*
811                  * We accept (and ignore) any error from unwatch_fd() as we are
812                  * closing the socket, hoping it doesn't leave dangling state in
813                  * the kernel.
814                  * Note that unwatch_fd() must be called after releasing the
815                  * fdlock; otherwise it could cause deadlock due to a lock order
816                  * reversal.
817                  */
818                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
819                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
820                 return;
821         }
822         if (manager->fdstate[fd] != MANAGED) {
823                 UNLOCK(&manager->fdlock[lockid]);
824                 return;
825         }
826         UNLOCK(&manager->fdlock[lockid]);
827
828         /*
829          * Set requested bit.
830          */
831         result = watch_fd(manager, fd, msg);
832         if (result != ISC_R_SUCCESS) {
833                 /*
834                  * XXXJT: what should we do?  Ignoring the failure of watching
835                  * a socket will make the application dysfunctional, but there
836                  * seems to be no reasonable recovery process.
837                  */
838                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
839                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
840                               "failed to start watching FD (%d): %s",
841                               fd, isc_result_totext(result));
842         }
843 }
844
845 #ifdef ISC_PLATFORM_USETHREADS
846 /*
847  * Poke the select loop when there is something for us to do.
848  * The write is required (by POSIX) to complete.  That is, we
849  * will not get partial writes.
850  */
851 static void
852 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
853         int cc;
854         int buf[2];
855         char strbuf[ISC_STRERRORSIZE];
856
857         buf[0] = fd;
858         buf[1] = msg;
859
860         do {
861                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
862 #ifdef ENOSR
863                 /*
864                  * Treat ENOSR as EAGAIN but loop slowly as it is
865                  * unlikely to clear fast.
866                  */
867                 if (cc < 0 && errno == ENOSR) {
868                         sleep(1);
869                         errno = EAGAIN;
870                 }
871 #endif
872         } while (cc < 0 && SOFT_ERROR(errno));
873
874         if (cc < 0) {
875                 isc__strerror(errno, strbuf, sizeof(strbuf));
876                 FATAL_ERROR(__FILE__, __LINE__,
877                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
878                                            ISC_MSG_WRITEFAILED,
879                                            "write() failed "
880                                            "during watcher poke: %s"),
881                             strbuf);
882         }
883
884         INSIST(cc == sizeof(buf));
885 }
886
887 /*
888  * Read a message on the internal fd.
889  */
890 static void
891 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
892         int buf[2];
893         int cc;
894         char strbuf[ISC_STRERRORSIZE];
895
896         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
897         if (cc < 0) {
898                 *msg = SELECT_POKE_NOTHING;
899                 *fd = -1;       /* Silence compiler. */
900                 if (SOFT_ERROR(errno))
901                         return;
902
903                 isc__strerror(errno, strbuf, sizeof(strbuf));
904                 FATAL_ERROR(__FILE__, __LINE__,
905                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
906                                            ISC_MSG_READFAILED,
907                                            "read() failed "
908                                            "during watcher poke: %s"),
909                             strbuf);
910
911                 return;
912         }
913         INSIST(cc == sizeof(buf));
914
915         *fd = buf[0];
916         *msg = buf[1];
917 }
918 #else /* ISC_PLATFORM_USETHREADS */
919 /*
920  * Update the state of the socketmgr when something changes.
921  */
922 static void
923 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
924         if (msg == SELECT_POKE_SHUTDOWN)
925                 return;
926         else if (fd >= 0)
927                 wakeup_socket(manager, fd, msg);
928         return;
929 }
930 #endif /* ISC_PLATFORM_USETHREADS */
931
932 /*
933  * Make a fd non-blocking.
934  */
935 static isc_result_t
936 make_nonblock(int fd) {
937         int ret;
938         int flags;
939         char strbuf[ISC_STRERRORSIZE];
940 #ifdef USE_FIONBIO_IOCTL
941         int on = 1;
942
943         ret = ioctl(fd, FIONBIO, (char *)&on);
944 #else
945         flags = fcntl(fd, F_GETFL, 0);
946         flags |= PORT_NONBLOCK;
947         ret = fcntl(fd, F_SETFL, flags);
948 #endif
949
950         if (ret == -1) {
951                 isc__strerror(errno, strbuf, sizeof(strbuf));
952                 UNEXPECTED_ERROR(__FILE__, __LINE__,
953 #ifdef USE_FIONBIO_IOCTL
954                                  "ioctl(%d, FIONBIO, &on): %s", fd,
955 #else
956                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
957 #endif
958                                  strbuf);
959
960                 return (ISC_R_UNEXPECTED);
961         }
962
963         return (ISC_R_SUCCESS);
964 }
965
966 #ifdef USE_CMSG
967 /*
968  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
969  * In order to ensure as much portability as possible, we provide wrapper
970  * functions of these macros.
971  * Note that cmsg_space() could run slow on OSes that do not have
972  * CMSG_SPACE.
973  */
974 static inline ISC_SOCKADDR_LEN_T
975 cmsg_len(ISC_SOCKADDR_LEN_T len) {
976 #ifdef CMSG_LEN
977         return (CMSG_LEN(len));
978 #else
979         ISC_SOCKADDR_LEN_T hdrlen;
980
981         /*
982          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
983          * is correct.
984          */
985         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
986         return (hdrlen + len);
987 #endif
988 }
989
990 static inline ISC_SOCKADDR_LEN_T
991 cmsg_space(ISC_SOCKADDR_LEN_T len) {
992 #ifdef CMSG_SPACE
993         return (CMSG_SPACE(len));
994 #else
995         struct msghdr msg;
996         struct cmsghdr *cmsgp;
997         /*
998          * XXX: The buffer length is an ad-hoc value, but should be enough
999          * in a practical sense.
1000          */
1001         char dummybuf[sizeof(struct cmsghdr) + 1024];
1002
1003         memset(&msg, 0, sizeof(msg));
1004         msg.msg_control = dummybuf;
1005         msg.msg_controllen = sizeof(dummybuf);
1006
1007         cmsgp = (struct cmsghdr *)dummybuf;
1008         cmsgp->cmsg_len = cmsg_len(len);
1009
1010         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
1011         if (cmsgp != NULL)
1012                 return ((char *)cmsgp - (char *)msg.msg_control);
1013         else
1014                 return (0);
1015 #endif
1016 }
1017 #endif /* USE_CMSG */
1018
1019 /*
1020  * Process control messages received on a socket.
1021  */
1022 static void
1023 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1024 #ifdef USE_CMSG
1025         struct cmsghdr *cmsgp;
1026 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1027         struct in6_pktinfo *pktinfop;
1028 #endif
1029 #ifdef SO_TIMESTAMP
1030         struct timeval *timevalp;
1031 #endif
1032 #endif
1033
1034         /*
1035          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1036          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1037          * They are all here, outside of the CPP tests, because it is
1038          * more consistent with the usual ISC coding style.
1039          */
1040         UNUSED(sock);
1041         UNUSED(msg);
1042         UNUSED(dev);
1043
1044 #ifdef ISC_NET_BSD44MSGHDR
1045
1046 #ifdef MSG_TRUNC
1047         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
1048                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1049 #endif
1050
1051 #ifdef MSG_CTRUNC
1052         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
1053                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1054 #endif
1055
1056 #ifndef USE_CMSG
1057         return;
1058 #else
1059         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
1060                 return;
1061
1062 #ifdef SO_TIMESTAMP
1063         timevalp = NULL;
1064 #endif
1065 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1066         pktinfop = NULL;
1067 #endif
1068
1069         cmsgp = CMSG_FIRSTHDR(msg);
1070         while (cmsgp != NULL) {
1071                 socket_log(sock, NULL, TRACE,
1072                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
1073                            "processing cmsg %p", cmsgp);
1074
1075 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1076                 if (cmsgp->cmsg_level == IPPROTO_IPV6
1077                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
1078
1079                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1080                         memcpy(&dev->pktinfo, pktinfop,
1081                                sizeof(struct in6_pktinfo));
1082                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1083                         socket_log(sock, NULL, TRACE,
1084                                    isc_msgcat, ISC_MSGSET_SOCKET,
1085                                    ISC_MSG_IFRECEIVED,
1086                                    "interface received on ifindex %u",
1087                                    dev->pktinfo.ipi6_ifindex);
1088                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
1089                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1090                         goto next;
1091                 }
1092 #endif
1093
1094 #ifdef SO_TIMESTAMP
1095                 if (cmsgp->cmsg_level == SOL_SOCKET
1096                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
1097                         timevalp = (struct timeval *)CMSG_DATA(cmsgp);
1098                         dev->timestamp.seconds = timevalp->tv_sec;
1099                         dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
1100                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1101                         goto next;
1102                 }
1103 #endif
1104
1105         next:
1106                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
1107         }
1108 #endif /* USE_CMSG */
1109
1110 #endif /* ISC_NET_BSD44MSGHDR */
1111 }
1112
1113 /*
1114  * Construct an iov array and attach it to the msghdr passed in.  This is
1115  * the SEND constructor, which will use the used region of the buffer
1116  * (if using a buffer list) or will use the internal region (if a single
1117  * buffer I/O is requested).
1118  *
1119  * Nothing can be NULL, and the done event must list at least one buffer
1120  * on the buffer linked list for this function to be meaningful.
1121  *
1122  * If write_countp != NULL, *write_countp will hold the number of bytes
1123  * this transaction can send.
1124  */
1125 static void
1126 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
1127                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
1128 {
1129         unsigned int iovcount;
1130         isc_buffer_t *buffer;
1131         isc_region_t used;
1132         size_t write_count;
1133         size_t skip_count;
1134
1135         memset(msg, 0, sizeof(*msg));
1136
1137         if (!sock->connected) {
1138                 msg->msg_name = (void *)&dev->address.type.sa;
1139                 msg->msg_namelen = dev->address.length;
1140         } else {
1141                 msg->msg_name = NULL;
1142                 msg->msg_namelen = 0;
1143         }
1144
1145         buffer = ISC_LIST_HEAD(dev->bufferlist);
1146         write_count = 0;
1147         iovcount = 0;
1148
1149         /*
1150          * Single buffer I/O?  Skip what we've done so far in this region.
1151          */
1152         if (buffer == NULL) {
1153                 write_count = dev->region.length - dev->n;
1154                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1155                 iov[0].iov_len = write_count;
1156                 iovcount = 1;
1157
1158                 goto config;
1159         }
1160
1161         /*
1162          * Multibuffer I/O.
1163          * Skip the data in the buffer list that we have already written.
1164          */
1165         skip_count = dev->n;
1166         while (buffer != NULL) {
1167                 REQUIRE(ISC_BUFFER_VALID(buffer));
1168                 if (skip_count < isc_buffer_usedlength(buffer))
1169                         break;
1170                 skip_count -= isc_buffer_usedlength(buffer);
1171                 buffer = ISC_LIST_NEXT(buffer, link);
1172         }
1173
1174         while (buffer != NULL) {
1175                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
1176
1177                 isc_buffer_usedregion(buffer, &used);
1178
1179                 if (used.length > 0) {
1180                         iov[iovcount].iov_base = (void *)(used.base
1181                                                           + skip_count);
1182                         iov[iovcount].iov_len = used.length - skip_count;
1183                         write_count += (used.length - skip_count);
1184                         skip_count = 0;
1185                         iovcount++;
1186                 }
1187                 buffer = ISC_LIST_NEXT(buffer, link);
1188         }
1189
1190         INSIST(skip_count == 0U);
1191
1192  config:
1193         msg->msg_iov = iov;
1194         msg->msg_iovlen = iovcount;
1195
1196 #ifdef ISC_NET_BSD44MSGHDR
1197         msg->msg_control = NULL;
1198         msg->msg_controllen = 0;
1199         msg->msg_flags = 0;
1200 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1201         if ((sock->type == isc_sockettype_udp)
1202             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
1203                 struct cmsghdr *cmsgp;
1204                 struct in6_pktinfo *pktinfop;
1205
1206                 socket_log(sock, NULL, TRACE,
1207                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
1208                            "sendto pktinfo data, ifindex %u",
1209                            dev->pktinfo.ipi6_ifindex);
1210
1211                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1212                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1213                 msg->msg_control = (void *)sock->sendcmsgbuf;
1214
1215                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
1216                 cmsgp->cmsg_level = IPPROTO_IPV6;
1217                 cmsgp->cmsg_type = IPV6_PKTINFO;
1218                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1219                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1220                 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1221         }
1222 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
1223 #else /* ISC_NET_BSD44MSGHDR */
1224         msg->msg_accrights = NULL;
1225         msg->msg_accrightslen = 0;
1226 #endif /* ISC_NET_BSD44MSGHDR */
1227
1228         if (write_countp != NULL)
1229                 *write_countp = write_count;
1230 }
1231
1232 /*
1233  * Construct an iov array and attach it to the msghdr passed in.  This is
1234  * the RECV constructor, which will use the available region of the buffer
1235  * (if using a buffer list) or will use the internal region (if a single
1236  * buffer I/O is requested).
1237  *
1238  * Nothing can be NULL, and the done event must list at least one buffer
1239  * on the buffer linked list for this function to be meaningful.
1240  *
1241  * If read_countp != NULL, *read_countp will hold the number of bytes
1242  * this transaction can receive.
1243  */
1244 static void
1245 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
1246                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1247 {
1248         unsigned int iovcount;
1249         isc_buffer_t *buffer;
1250         isc_region_t available;
1251         size_t read_count;
1252
1253         memset(msg, 0, sizeof(struct msghdr));
1254
1255         if (sock->type == isc_sockettype_udp) {
1256                 memset(&dev->address, 0, sizeof(dev->address));
1257 #ifdef BROKEN_RECVMSG
1258                 if (sock->pf == AF_INET) {
1259                         msg->msg_name = (void *)&dev->address.type.sin;
1260                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1261                 } else if (sock->pf == AF_INET6) {
1262                         msg->msg_name = (void *)&dev->address.type.sin6;
1263                         msg->msg_namelen = sizeof(dev->address.type.sin6);
1264 #ifdef ISC_PLATFORM_HAVESYSUNH
1265                 } else if (sock->pf == AF_UNIX) {
1266                         msg->msg_name = (void *)&dev->address.type.sunix;
1267                         msg->msg_namelen = sizeof(dev->address.type.sunix);
1268 #endif
1269                 } else {
1270                         msg->msg_name = (void *)&dev->address.type.sa;
1271                         msg->msg_namelen = sizeof(dev->address.type);
1272                 }
1273 #else
1274                 msg->msg_name = (void *)&dev->address.type.sa;
1275                 msg->msg_namelen = sizeof(dev->address.type);
1276 #endif
1277 #ifdef ISC_NET_RECVOVERFLOW
1278                 /* If needed, steal one iovec for overflow detection. */
1279                 maxiov--;
1280 #endif
1281         } else { /* TCP */
1282                 msg->msg_name = NULL;
1283                 msg->msg_namelen = 0;
1284                 dev->address = sock->peer_address;
1285         }
1286
1287         buffer = ISC_LIST_HEAD(dev->bufferlist);
1288         read_count = 0;
1289
1290         /*
1291          * Single buffer I/O?  Skip what we've done so far in this region.
1292          */
1293         if (buffer == NULL) {
1294                 read_count = dev->region.length - dev->n;
1295                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
1296                 iov[0].iov_len = read_count;
1297                 iovcount = 1;
1298
1299                 goto config;
1300         }
1301
1302         /*
1303          * Multibuffer I/O.
1304          * Skip empty buffers.
1305          */
1306         while (buffer != NULL) {
1307                 REQUIRE(ISC_BUFFER_VALID(buffer));
1308                 if (isc_buffer_availablelength(buffer) != 0)
1309                         break;
1310                 buffer = ISC_LIST_NEXT(buffer, link);
1311         }
1312
1313         iovcount = 0;
1314         while (buffer != NULL) {
1315                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
1316
1317                 isc_buffer_availableregion(buffer, &available);
1318
1319                 if (available.length > 0) {
1320                         iov[iovcount].iov_base = (void *)(available.base);
1321                         iov[iovcount].iov_len = available.length;
1322                         read_count += available.length;
1323                         iovcount++;
1324                 }
1325                 buffer = ISC_LIST_NEXT(buffer, link);
1326         }
1327
1328  config:
1329
1330         /*
1331          * If needed, set up to receive that one extra byte.  Note that
1332          * we know there is at least one iov left, since we stole it
1333          * at the top of this function.
1334          */
1335 #ifdef ISC_NET_RECVOVERFLOW
1336         if (sock->type == isc_sockettype_udp) {
1337                 iov[iovcount].iov_base = (void *)(&sock->overflow);
1338                 iov[iovcount].iov_len = 1;
1339                 iovcount++;
1340         }
1341 #endif
1342
1343         msg->msg_iov = iov;
1344         msg->msg_iovlen = iovcount;
1345
1346 #ifdef ISC_NET_BSD44MSGHDR
1347         msg->msg_control = NULL;
1348         msg->msg_controllen = 0;
1349         msg->msg_flags = 0;
1350 #if defined(USE_CMSG)
1351         if (sock->type == isc_sockettype_udp) {
1352                 msg->msg_control = sock->recvcmsgbuf;
1353                 msg->msg_controllen = sock->recvcmsgbuflen;
1354         }
1355 #endif /* USE_CMSG */
1356 #else /* ISC_NET_BSD44MSGHDR */
1357         msg->msg_accrights = NULL;
1358         msg->msg_accrightslen = 0;
1359 #endif /* ISC_NET_BSD44MSGHDR */
1360
1361         if (read_countp != NULL)
1362                 *read_countp = read_count;
1363 }
1364
1365 static void
1366 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1367                 isc_socketevent_t *dev)
1368 {
1369         if (sock->type == isc_sockettype_udp) {
1370                 if (address != NULL)
1371                         dev->address = *address;
1372                 else
1373                         dev->address = sock->peer_address;
1374         } else if (sock->type == isc_sockettype_tcp) {
1375                 INSIST(address == NULL);
1376                 dev->address = sock->peer_address;
1377         }
1378 }
1379
1380 static void
1381 destroy_socketevent(isc_event_t *event) {
1382         isc_socketevent_t *ev = (isc_socketevent_t *)event;
1383
1384         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1385
1386         (ev->destroy)(event);
1387 }
1388
1389 static isc_socketevent_t *
1390 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1391                      isc_taskaction_t action, const void *arg)
1392 {
1393         isc_socketevent_t *ev;
1394
1395         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1396                                                      sock, eventtype,
1397                                                      action, arg,
1398                                                      sizeof(*ev));
1399
1400         if (ev == NULL)
1401                 return (NULL);
1402
1403         ev->result = ISC_R_UNEXPECTED;
1404         ISC_LINK_INIT(ev, ev_link);
1405         ISC_LIST_INIT(ev->bufferlist);
1406         ev->region.base = NULL;
1407         ev->n = 0;
1408         ev->offset = 0;
1409         ev->attributes = 0;
1410         ev->destroy = ev->ev_destroy;
1411         ev->ev_destroy = destroy_socketevent;
1412
1413         return (ev);
1414 }
1415
1416 #if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the name, iovec array and (when
 * ISC_NET_BSD44MSGHDR is defined) control fields of 'msg' to stdout.
 * 'msg' is not modified.
 */
static void
dump_msg(struct msghdr *msg) {
        unsigned int i;

        /* %p requires a void * argument, hence the casts. */
        printf("MSGHDR %p\n", (void *)msg);
        printf("\tname %p, namelen %ld\n", (void *)msg->msg_name,
               (long) msg->msg_namelen);
        printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
               (long) msg->msg_iovlen);
        for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
                printf("\t\t%u\tbase %p, len %ld\n", i,
                       (void *)msg->msg_iov[i].iov_base,
                       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
        printf("\tcontrol %p, controllen %ld\n", (void *)msg->msg_control,
               (long) msg->msg_controllen);
#endif
}
1435 #endif
1436
/*
 * Status codes returned by doio_recv() and doio_send().
 */
#define DOIO_SUCCESS    0       /* i/o ok, event sent */
#define DOIO_SOFT       1       /* i/o ok, soft error, no event sent */
#define DOIO_HARD       2       /* i/o error, event sent */
#define DOIO_EOF        3       /* EOF, no event sent */
1441
1442 static int
1443 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1444         int cc;
1445         struct iovec iov[MAXSCATTERGATHER_RECV];
1446         size_t read_count;
1447         size_t actual_count;
1448         struct msghdr msghdr;
1449         isc_buffer_t *buffer;
1450         int recv_errno;
1451         char strbuf[ISC_STRERRORSIZE];
1452
1453         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
1454
1455 #if defined(ISC_SOCKET_DEBUG)
1456         dump_msg(&msghdr);
1457 #endif
1458
1459         cc = recvmsg(sock->fd, &msghdr, 0);
1460         recv_errno = errno;
1461
1462 #if defined(ISC_SOCKET_DEBUG)
1463         dump_msg(&msghdr);
1464 #endif
1465
1466         if (cc < 0) {
1467                 if (SOFT_ERROR(recv_errno))
1468                         return (DOIO_SOFT);
1469
1470                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1471                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1472                         socket_log(sock, NULL, IOEVENT,
1473                                    isc_msgcat, ISC_MSGSET_SOCKET,
1474                                    ISC_MSG_DOIORECV,
1475                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1476                                    sock->fd, cc, recv_errno, strbuf);
1477                 }
1478
1479 #define SOFT_OR_HARD(_system, _isc) \
1480         if (recv_errno == _system) { \
1481                 if (sock->connected) { \
1482                         dev->result = _isc; \
1483                         inc_stats(sock->manager->stats, \
1484                                   sock->statsindex[STATID_RECVFAIL]); \
1485                         return (DOIO_HARD); \
1486                 } \
1487                 return (DOIO_SOFT); \
1488         }
1489 #define ALWAYS_HARD(_system, _isc) \
1490         if (recv_errno == _system) { \
1491                 dev->result = _isc; \
1492                 inc_stats(sock->manager->stats, \
1493                           sock->statsindex[STATID_RECVFAIL]); \
1494                 return (DOIO_HARD); \
1495         }
1496
1497                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1498                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1499                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1500                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1501                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
1502                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1503                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1504                 /*
1505                  * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
1506                  * errors.
1507                  */
1508 #ifdef EPROTO
1509                 SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
1510 #endif
1511                 SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
1512
1513 #undef SOFT_OR_HARD
1514 #undef ALWAYS_HARD
1515
1516                 dev->result = isc__errno2result(recv_errno);
1517                 inc_stats(sock->manager->stats,
1518                           sock->statsindex[STATID_RECVFAIL]);
1519                 return (DOIO_HARD);
1520         }
1521
1522         /*
1523          * On TCP, zero length reads indicate EOF, while on
1524          * UDP, zero length reads are perfectly valid, although
1525          * strange.
1526          */
1527         if ((sock->type == isc_sockettype_tcp) && (cc == 0))
1528                 return (DOIO_EOF);
1529
1530         if (sock->type == isc_sockettype_udp) {
1531                 dev->address.length = msghdr.msg_namelen;
1532                 if (isc_sockaddr_getport(&dev->address) == 0) {
1533                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1534                                 socket_log(sock, &dev->address, IOEVENT,
1535                                            isc_msgcat, ISC_MSGSET_SOCKET,
1536                                            ISC_MSG_ZEROPORT,
1537                                            "dropping source port zero packet");
1538                         }
1539                         return (DOIO_SOFT);
1540                 }
1541         }
1542
1543         socket_log(sock, &dev->address, IOEVENT,
1544                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1545                    "packet received correctly");
1546
1547         /*
1548          * Overflow bit detection.  If we received MORE bytes than we should,
1549          * this indicates an overflow situation.  Set the flag in the
1550          * dev entry and adjust how much we read by one.
1551          */
1552 #ifdef ISC_NET_RECVOVERFLOW
1553         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1554                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1555                 cc--;
1556         }
1557 #endif
1558
1559         /*
1560          * If there are control messages attached, run through them and pull
1561          * out the interesting bits.
1562          */
1563         if (sock->type == isc_sockettype_udp)
1564                 process_cmsg(sock, &msghdr, dev);
1565
1566         /*
1567          * update the buffers (if any) and the i/o count
1568          */
1569         dev->n += cc;
1570         actual_count = cc;
1571         buffer = ISC_LIST_HEAD(dev->bufferlist);
1572         while (buffer != NULL && actual_count > 0U) {
1573                 REQUIRE(ISC_BUFFER_VALID(buffer));
1574                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1575                         actual_count -= isc_buffer_availablelength(buffer);
1576                         isc_buffer_add(buffer,
1577                                        isc_buffer_availablelength(buffer));
1578                 } else {
1579                         isc_buffer_add(buffer, actual_count);
1580                         actual_count = 0;
1581                         break;
1582                 }
1583                 buffer = ISC_LIST_NEXT(buffer, link);
1584                 if (buffer == NULL) {
1585                         INSIST(actual_count == 0U);
1586                 }
1587         }
1588
1589         /*
1590          * If we read less than we expected, update counters,
1591          * and let the upper layer poke the descriptor.
1592          */
1593         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1594                 return (DOIO_SOFT);
1595
1596         /*
1597          * Full reads are posted, or partials if partials are ok.
1598          */
1599         dev->result = ISC_R_SUCCESS;
1600         return (DOIO_SUCCESS);
1601 }
1602
1603 /*
1604  * Returns:
1605  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1606  *                      ISC_R_SUCCESS.
1607  *
1608  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1609  *                      dev->result contains the appropriate error.
1610  *
1611  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1612  *                      event was sent.  The operation should be retried.
1613  *
1614  *      No other return values are possible.
1615  */
1616 static int
1617 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1618         int cc;
1619         struct iovec iov[MAXSCATTERGATHER_SEND];
1620         size_t write_count;
1621         struct msghdr msghdr;
1622         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1623         int attempts = 0;
1624         int send_errno;
1625         char strbuf[ISC_STRERRORSIZE];
1626
1627         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1628
1629  resend:
1630         cc = sendmsg(sock->fd, &msghdr, 0);
1631         send_errno = errno;
1632
1633         /*
1634          * Check for error or block condition.
1635          */
1636         if (cc < 0) {
1637                 if (send_errno == EINTR && ++attempts < NRETRIES)
1638                         goto resend;
1639
1640                 if (SOFT_ERROR(send_errno))
1641                         return (DOIO_SOFT);
1642
1643 #define SOFT_OR_HARD(_system, _isc) \
1644         if (send_errno == _system) { \
1645                 if (sock->connected) { \
1646                         dev->result = _isc; \
1647                         inc_stats(sock->manager->stats, \
1648                                   sock->statsindex[STATID_SENDFAIL]); \
1649                         return (DOIO_HARD); \
1650                 } \
1651                 return (DOIO_SOFT); \
1652         }
1653 #define ALWAYS_HARD(_system, _isc) \
1654         if (send_errno == _system) { \
1655                 dev->result = _isc; \
1656                 inc_stats(sock->manager->stats, \
1657                           sock->statsindex[STATID_SENDFAIL]); \
1658                 return (DOIO_HARD); \
1659         }
1660
1661                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1662                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1663                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1664                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1665                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1666 #ifdef EHOSTDOWN
1667                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1668 #endif
1669                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1670                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1671                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1672                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1673                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1674
1675 #undef SOFT_OR_HARD
1676 #undef ALWAYS_HARD
1677
1678                 /*
1679                  * The other error types depend on whether or not the
1680                  * socket is UDP or TCP.  If it is UDP, some errors
1681                  * that we expect to be fatal under TCP are merely
1682                  * annoying, and are really soft errors.
1683                  *
1684                  * However, these soft errors are still returned as
1685                  * a status.
1686                  */
1687                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1688                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1689                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1690                                  addrbuf, strbuf);
1691                 dev->result = isc__errno2result(send_errno);
1692                 inc_stats(sock->manager->stats,
1693                           sock->statsindex[STATID_SENDFAIL]);
1694                 return (DOIO_HARD);
1695         }
1696
1697         if (cc == 0) {
1698                 inc_stats(sock->manager->stats,
1699                           sock->statsindex[STATID_SENDFAIL]);
1700                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1701                                  "doio_send: send() %s 0",
1702                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1703                                                 ISC_MSG_RETURNED, "returned"));
1704         }
1705
1706         /*
1707          * If we write less than we expected, update counters, poke.
1708          */
1709         dev->n += cc;
1710         if ((size_t)cc != write_count)
1711                 return (DOIO_SOFT);
1712
1713         /*
1714          * Exactly what we wanted to write.  We're done with this
1715          * entry.  Post its completion event.
1716          */
1717         dev->result = ISC_R_SUCCESS;
1718         return (DOIO_SUCCESS);
1719 }
1720
1721 /*
1722  * Kill.
1723  *
1724  * Caller must ensure that the socket is not locked and no external
1725  * references exist.
1726  */
1727 static void
1728 closesocket(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
1729         isc_sockettype_t type = sock->type;
1730         int lockid = FDLOCK_ID(fd);
1731
1732         /*
1733          * No one has this socket open, so the watcher doesn't have to be
1734          * poked, and the socket doesn't have to be locked.
1735          */
1736         LOCK(&manager->fdlock[lockid]);
1737         manager->fds[fd] = NULL;
1738         if (type == isc_sockettype_fdwatch)
1739                 manager->fdstate[fd] = CLOSED;
1740         else
1741                 manager->fdstate[fd] = CLOSE_PENDING;
1742         UNLOCK(&manager->fdlock[lockid]);
1743         if (type == isc_sockettype_fdwatch) {
1744                 /*
1745                  * The caller may close the socket once this function returns,
1746                  * and `fd' may be reassigned for a new socket.  So we do
1747                  * unwatch_fd() here, rather than defer it via select_poke().
1748                  * Note: this may complicate data protection among threads and
1749                  * may reduce performance due to additional locks.  One way to
1750                  * solve this would be to dup() the watched descriptor, but we
1751                  * take a simpler approach at this moment.
1752                  */
1753                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1754                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1755         } else
1756                 select_poke(manager, fd, SELECT_POKE_CLOSE);
1757
1758         inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);
1759
1760         /*
1761          * update manager->maxfd here (XXX: this should be implemented more
1762          * efficiently)
1763          */
1764 #ifdef USE_SELECT
1765         LOCK(&manager->lock);
1766         if (manager->maxfd == fd) {
1767                 int i;
1768
1769                 manager->maxfd = 0;
1770                 for (i = fd - 1; i >= 0; i--) {
1771                         lockid = FDLOCK_ID(i);
1772
1773                         LOCK(&manager->fdlock[lockid]);
1774                         if (manager->fdstate[i] == MANAGED) {
1775                                 manager->maxfd = i;
1776                                 UNLOCK(&manager->fdlock[lockid]);
1777                                 break;
1778                         }
1779                         UNLOCK(&manager->fdlock[lockid]);
1780                 }
1781 #ifdef ISC_PLATFORM_USETHREADS
1782                 if (manager->maxfd < manager->pipe_fds[0])
1783                         manager->maxfd = manager->pipe_fds[0];
1784 #endif
1785         }
1786         UNLOCK(&manager->lock);
1787 #endif  /* USE_SELECT */
1788 }
1789
1790 static void
1791 destroy(isc_socket_t **sockp) {
1792         int fd;
1793         isc_socket_t *sock = *sockp;
1794         isc_socketmgr_t *manager = sock->manager;
1795
1796         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1797                    ISC_MSG_DESTROYING, "destroying");
1798
1799         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1800         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1801         INSIST(ISC_LIST_EMPTY(sock->send_list));
1802         INSIST(sock->connect_ev == NULL);
1803         REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
1804
1805         if (sock->fd >= 0) {
1806                 fd = sock->fd;
1807                 sock->fd = -1;
1808                 closesocket(manager, sock, fd);
1809         }
1810
1811         LOCK(&manager->lock);
1812
1813         ISC_LIST_UNLINK(manager->socklist, sock, link);
1814
1815 #ifdef ISC_PLATFORM_USETHREADS
1816         if (ISC_LIST_EMPTY(manager->socklist))
1817                 SIGNAL(&manager->shutdown_ok);
1818 #endif /* ISC_PLATFORM_USETHREADS */
1819
1820         UNLOCK(&manager->lock);
1821
1822         free_socket(sockp);
1823 }
1824
1825 static isc_result_t
1826 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1827                 isc_socket_t **socketp)
1828 {
1829         isc_socket_t *sock;
1830         isc_result_t result;
1831         ISC_SOCKADDR_LEN_T cmsgbuflen;
1832
1833         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1834
1835         if (sock == NULL)
1836                 return (ISC_R_NOMEMORY);
1837
1838         result = ISC_R_UNEXPECTED;
1839
1840         sock->magic = 0;
1841         sock->references = 0;
1842
1843         sock->manager = manager;
1844         sock->type = type;
1845         sock->fd = -1;
1846         sock->statsindex = NULL;
1847
1848         ISC_LINK_INIT(sock, link);
1849
1850         sock->recvcmsgbuf = NULL;
1851         sock->sendcmsgbuf = NULL;
1852
1853         /*
1854          * set up cmsg buffers
1855          */
1856         cmsgbuflen = 0;
1857 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1858         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1859 #endif
1860 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1861         cmsgbuflen += cmsg_space(sizeof(struct timeval));
1862 #endif
1863         sock->recvcmsgbuflen = cmsgbuflen;
1864         if (sock->recvcmsgbuflen != 0U) {
1865                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1866                 if (sock->recvcmsgbuf == NULL)
1867                         goto error;
1868         }
1869
1870         cmsgbuflen = 0;
1871 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1872         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1873 #endif
1874         sock->sendcmsgbuflen = cmsgbuflen;
1875         if (sock->sendcmsgbuflen != 0U) {
1876                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1877                 if (sock->sendcmsgbuf == NULL)
1878                         goto error;
1879         }
1880
1881         memset(sock->name, 0, sizeof(sock->name));
1882
1883         memset(sock->name, 0, sizeof(sock->name));
1884         sock->tag = NULL;
1885
1886         /*
1887          * set up list of readers and writers to be initially empty
1888          */
1889         ISC_LIST_INIT(sock->recv_list);
1890         ISC_LIST_INIT(sock->send_list);
1891         ISC_LIST_INIT(sock->accept_list);
1892         sock->connect_ev = NULL;
1893         sock->pending_recv = 0;
1894         sock->pending_send = 0;
1895         sock->pending_accept = 0;
1896         sock->listener = 0;
1897         sock->connected = 0;
1898         sock->connecting = 0;
1899         sock->bound = 0;
1900
1901         /*
1902          * initialize the lock
1903          */
1904         result = isc_mutex_init(&sock->lock);
1905         if (result != ISC_R_SUCCESS) {
1906                 sock->magic = 0;
1907                 goto error;
1908         }
1909
1910         /*
1911          * Initialize readable and writable events
1912          */
1913         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1914                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1915                        NULL, sock, sock, NULL, NULL);
1916         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1917                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1918                        NULL, sock, sock, NULL, NULL);
1919
1920         sock->magic = SOCKET_MAGIC;
1921         *socketp = sock;
1922
1923         return (ISC_R_SUCCESS);
1924
1925  error:
1926         if (sock->recvcmsgbuf != NULL)
1927                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1928                             sock->recvcmsgbuflen);
1929         if (sock->sendcmsgbuf != NULL)
1930                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1931                             sock->sendcmsgbuflen);
1932         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1933
1934         return (result);
1935 }
1936
1937 /*
1938  * This event requires that the various lists be empty, that the reference
1939  * count be 1, and that the magic number is valid.  The other socket bits,
1940  * like the lock, must be initialized as well.  The fd associated must be
1941  * marked as closed, by setting it to -1 on close, or this routine will
1942  * also close the socket.
1943  */
1944 static void
1945 free_socket(isc_socket_t **socketp) {
1946         isc_socket_t *sock = *socketp;
1947
1948         INSIST(sock->references == 0);
1949         INSIST(VALID_SOCKET(sock));
1950         INSIST(!sock->connecting);
1951         INSIST(!sock->pending_recv);
1952         INSIST(!sock->pending_send);
1953         INSIST(!sock->pending_accept);
1954         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1955         INSIST(ISC_LIST_EMPTY(sock->send_list));
1956         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1957         INSIST(!ISC_LINK_LINKED(sock, link));
1958
1959         if (sock->recvcmsgbuf != NULL)
1960                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1961                             sock->recvcmsgbuflen);
1962         if (sock->sendcmsgbuf != NULL)
1963                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1964                             sock->sendcmsgbuflen);
1965
1966         sock->magic = 0;
1967
1968         DESTROYLOCK(&sock->lock);
1969
1970         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1971
1972         *socketp = NULL;
1973 }
1974
#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  Having to work out
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * One-time initializer (run through 'bsdcompat_once'): on Linux, parse the
 * "major.minor" prefix of the running kernel's release string and clear
 * 'bsdcompat' when the kernel is 2.4 or newer.  On other platforms, or when
 * the release cannot be obtained or parsed, 'bsdcompat' keeps its default
 * of ISC_TRUE.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
         struct utsname buf;
         char *endp;
         long int major;
         long int minor;

         /*
          * uname() should only fail if buf is bad, but never parse an
          * uninitialized buffer: keep the default in that case.
          */
         if (uname(&buf) < 0)
                return;

         /* Paranoia in parsing can be increased, but we trust uname(). */
         major = strtol(buf.release, &endp, 10);
         if (*endp == '.') {
                minor = strtol(endp+1, &endp, 10);
                if ((major > 2) || ((major == 2) && (minor >= 4))) {
                        bsdcompat = ISC_FALSE;
                }
         }
#endif /* __linux__ */
}
#endif /* SO_BSDCOMPAT */
2012
2013 static isc_result_t
2014 opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
2015         char strbuf[ISC_STRERRORSIZE];
2016         const char *err = "socket";
2017         int tries = 0;
2018 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
2019         int on = 1;
2020 #endif
2021 #if defined(SO_RCVBUF)
2022         ISC_SOCKADDR_LEN_T optlen;
2023         int size;
2024 #endif
2025
2026  again:
2027         switch (sock->type) {
2028         case isc_sockettype_udp:
2029                 sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
2030                 break;
2031         case isc_sockettype_tcp:
2032                 sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
2033                 break;
2034         case isc_sockettype_unix:
2035                 sock->fd = socket(sock->pf, SOCK_STREAM, 0);
2036                 break;
2037         case isc_sockettype_fdwatch:
2038                 /*
2039                  * We should not be called for isc_sockettype_fdwatch sockets.
2040                  */
2041                 INSIST(0);
2042                 break;
2043         }
2044         if (sock->fd == -1 && errno == EINTR && tries++ < 42)
2045                 goto again;
2046
2047 #ifdef F_DUPFD
2048         /*
2049          * Leave a space for stdio and TCP to work in.
2050          */
2051         if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
2052             sock->fd >= 0 && sock->fd < manager->reserved) {
2053                 int new, tmp;
2054                 new = fcntl(sock->fd, F_DUPFD, manager->reserved);
2055                 tmp = errno;
2056                 (void)close(sock->fd);
2057                 errno = tmp;
2058                 sock->fd = new;
2059                 err = "isc_socket_create: fcntl/reserved";
2060         } else if (sock->fd >= 0 && sock->fd < 20) {
2061                 int new, tmp;
2062                 new = fcntl(sock->fd, F_DUPFD, 20);
2063                 tmp = errno;
2064                 (void)close(sock->fd);
2065                 errno = tmp;
2066                 sock->fd = new;
2067                 err = "isc_socket_create: fcntl";
2068         }
2069 #endif
2070
2071         if (sock->fd >= (int)manager->maxsocks) {
2072                 (void)close(sock->fd);
2073                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2074                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2075                                isc_msgcat, ISC_MSGSET_SOCKET,
2076                                ISC_MSG_TOOMANYFDS,
2077                                "socket: file descriptor exceeds limit (%d/%u)",
2078                                sock->fd, manager->maxsocks);
2079                 return (ISC_R_NORESOURCES);
2080         }
2081
2082         if (sock->fd < 0) {
2083                 switch (errno) {
2084                 case EMFILE:
2085                 case ENFILE:
2086                         isc__strerror(errno, strbuf, sizeof(strbuf));
2087                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2088                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2089                                        isc_msgcat, ISC_MSGSET_SOCKET,
2090                                        ISC_MSG_TOOMANYFDS,
2091                                        "%s: %s", err, strbuf);
2092                         /* fallthrough */
2093                 case ENOBUFS:
2094                         return (ISC_R_NORESOURCES);
2095
2096                 case EPROTONOSUPPORT:
2097                 case EPFNOSUPPORT:
2098                 case EAFNOSUPPORT:
2099                 /*
2100                  * Linux 2.2 (and maybe others) return EINVAL instead of
2101                  * EAFNOSUPPORT.
2102                  */
2103                 case EINVAL:
2104                         return (ISC_R_FAMILYNOSUPPORT);
2105
2106                 default:
2107                         isc__strerror(errno, strbuf, sizeof(strbuf));
2108                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2109                                          "%s() %s: %s", err,
2110                                          isc_msgcat_get(isc_msgcat,
2111                                                         ISC_MSGSET_GENERAL,
2112                                                         ISC_MSG_FAILED,
2113                                                         "failed"),
2114                                          strbuf);
2115                         return (ISC_R_UNEXPECTED);
2116                 }
2117         }
2118
2119         if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
2120                 (void)close(sock->fd);
2121                 return (ISC_R_UNEXPECTED);
2122         }
2123
2124 #ifdef SO_BSDCOMPAT
2125         RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
2126                                   clear_bsdcompat) == ISC_R_SUCCESS);
2127         if (sock->type != isc_sockettype_unix && bsdcompat &&
2128             setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
2129                        (void *)&on, sizeof(on)) < 0) {
2130                 isc__strerror(errno, strbuf, sizeof(strbuf));
2131                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2132                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
2133                                  sock->fd,
2134                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2135                                                 ISC_MSG_FAILED, "failed"),
2136                                  strbuf);
2137                 /* Press on... */
2138         }
2139 #endif
2140
2141 #ifdef SO_NOSIGPIPE
2142         if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
2143                        (void *)&on, sizeof(on)) < 0) {
2144                 isc__strerror(errno, strbuf, sizeof(strbuf));
2145                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2146                                  "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
2147                                  sock->fd,
2148                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2149                                                 ISC_MSG_FAILED, "failed"),
2150                                  strbuf);
2151                 /* Press on... */
2152         }
2153 #endif
2154
2155 #if defined(USE_CMSG) || defined(SO_RCVBUF)
2156         if (sock->type == isc_sockettype_udp) {
2157
2158 #if defined(USE_CMSG)
2159 #if defined(SO_TIMESTAMP)
2160                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
2161                                (void *)&on, sizeof(on)) < 0
2162                     && errno != ENOPROTOOPT) {
2163                         isc__strerror(errno, strbuf, sizeof(strbuf));
2164                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2165                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
2166                                          sock->fd,
2167                                          isc_msgcat_get(isc_msgcat,
2168                                                         ISC_MSGSET_GENERAL,
2169                                                         ISC_MSG_FAILED,
2170                                                         "failed"),
2171                                          strbuf);
2172                         /* Press on... */
2173                 }
2174 #endif /* SO_TIMESTAMP */
2175
2176 #if defined(ISC_PLATFORM_HAVEIPV6)
2177                 if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
2178                         /*
2179                          * Warn explicitly because this anomaly can be hidden
2180                          * in usual operation (and unexpectedly appear later).
2181                          */
2182                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2183                                          "No buffer available to receive "
2184                                          "IPv6 destination");
2185                 }
2186 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
2187 #ifdef IPV6_RECVPKTINFO
2188                 /* RFC 3542 */
2189                 if ((sock->pf == AF_INET6)
2190                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2191                                    (void *)&on, sizeof(on)) < 0)) {
2192                         isc__strerror(errno, strbuf, sizeof(strbuf));
2193                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2194                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
2195                                          "%s: %s", sock->fd,
2196                                          isc_msgcat_get(isc_msgcat,
2197                                                         ISC_MSGSET_GENERAL,
2198                                                         ISC_MSG_FAILED,
2199                                                         "failed"),
2200                                          strbuf);
2201                 }
2202 #else
2203                 /* RFC 2292 */
2204                 if ((sock->pf == AF_INET6)
2205                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2206                                    (void *)&on, sizeof(on)) < 0)) {
2207                         isc__strerror(errno, strbuf, sizeof(strbuf));
2208                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2209                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
2210                                          sock->fd,
2211                                          isc_msgcat_get(isc_msgcat,
2212                                                         ISC_MSGSET_GENERAL,
2213                                                         ISC_MSG_FAILED,
2214                                                         "failed"),
2215                                          strbuf);
2216                 }
2217 #endif /* IPV6_RECVPKTINFO */
2218 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
2219 #ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
2220                 /* use minimum MTU */
2221                 if (sock->pf == AF_INET6) {
2222                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
2223                                          IPV6_USE_MIN_MTU,
2224                                          (void *)&on, sizeof(on));
2225                 }
2226 #endif
2227 #endif /* ISC_PLATFORM_HAVEIPV6 */
2228 #endif /* defined(USE_CMSG) */
2229
2230 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
2231                 /*
2232                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2233                  */
2234                 if (sock->pf == AF_INET) {
2235                         int action = IP_PMTUDISC_DONT;
2236                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2237                                          &action, sizeof(action));
2238                 }
2239 #endif
2240 #if defined(IP_DONTFRAG)
2241                 /*
2242                  * Turn off Path MTU discovery on IPv4/UDP sockets.
2243                  */
2244                 if (sock->pf == AF_INET) {
2245                         int off = 0;
2246                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
2247                                          &off, sizeof(off));
2248                 }
2249 #endif
2250
2251 #if defined(SO_RCVBUF)
2252                 optlen = sizeof(size);
2253                 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2254                                (void *)&size, &optlen) >= 0 &&
2255                      size < RCVBUFSIZE) {
2256                         size = RCVBUFSIZE;
2257                         if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2258                                        (void *)&size, sizeof(size)) == -1) {
2259                                 isc__strerror(errno, strbuf, sizeof(strbuf));
2260                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2261                                         "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
2262                                         sock->fd, size,
2263                                         isc_msgcat_get(isc_msgcat,
2264                                                        ISC_MSGSET_GENERAL,
2265                                                        ISC_MSG_FAILED,
2266                                                        "failed"),
2267                                         strbuf);
2268                         }
2269                 }
2270 #endif
2271         }
2272 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
2273
2274         inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
2275
2276         return (ISC_R_SUCCESS);
2277 }
2278
2279 /*%
2280  * Create a new 'type' socket managed by 'manager'.  Events
2281  * will be posted to 'task' and when dispatched 'action' will be
2282  * called with 'arg' as the arg value.  The new socket is returned
2283  * in 'socketp'.
2284  */
2285 isc_result_t
2286 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
2287                   isc_socket_t **socketp)
2288 {
2289         isc_socket_t *sock = NULL;
2290         isc_result_t result;
2291         int lockid;
2292
2293         REQUIRE(VALID_MANAGER(manager));
2294         REQUIRE(socketp != NULL && *socketp == NULL);
2295         REQUIRE(type != isc_sockettype_fdwatch);
2296
2297         result = allocate_socket(manager, type, &sock);
2298         if (result != ISC_R_SUCCESS)
2299                 return (result);
2300
2301         switch (sock->type) {
2302         case isc_sockettype_udp:
2303                 sock->statsindex =
2304                         (pf == AF_INET) ? upd4statsindex : upd6statsindex;
2305                 break;
2306         case isc_sockettype_tcp:
2307                 sock->statsindex =
2308                         (pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
2309                 break;
2310         case isc_sockettype_unix:
2311                 sock->statsindex = unixstatsindex;
2312                 break;
2313         default:
2314                 INSIST(0);
2315         }
2316
2317         sock->pf = pf;
2318         result = opensocket(manager, sock);
2319         if (result != ISC_R_SUCCESS) {
2320                 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
2321                 free_socket(&sock);
2322                 return (result);
2323         }
2324
2325         sock->references = 1;
2326         *socketp = sock;
2327
2328         /*
2329          * Note we don't have to lock the socket like we normally would because
2330          * there are no external references to it yet.
2331          */
2332
2333         lockid = FDLOCK_ID(sock->fd);
2334         LOCK(&manager->fdlock[lockid]);
2335         manager->fds[sock->fd] = sock;
2336         manager->fdstate[sock->fd] = MANAGED;
2337 #ifdef USE_DEVPOLL
2338         INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2339                sock->manager->fdpollinfo[sock->fd].want_write == 0);
2340 #endif
2341         UNLOCK(&manager->fdlock[lockid]);
2342
2343         LOCK(&manager->lock);
2344         ISC_LIST_APPEND(manager->socklist, sock, link);
2345 #ifdef USE_SELECT
2346         if (manager->maxfd < sock->fd)
2347                 manager->maxfd = sock->fd;
2348 #endif
2349         UNLOCK(&manager->lock);
2350
2351         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2352                    ISC_MSG_CREATED, "created");
2353
2354         return (ISC_R_SUCCESS);
2355 }
2356
2357 isc_result_t
2358 isc_socket_open(isc_socket_t *sock) {
2359         isc_result_t result;
2360
2361         REQUIRE(VALID_SOCKET(sock));
2362
2363         LOCK(&sock->lock);
2364         REQUIRE(sock->references == 1);
2365         REQUIRE(sock->type != isc_sockettype_fdwatch);
2366         UNLOCK(&sock->lock);
2367         /*
2368          * We don't need to retain the lock hereafter, since no one else has
2369          * this socket.
2370          */
2371         REQUIRE(sock->fd == -1);
2372
2373         result = opensocket(sock->manager, sock);
2374         if (result != ISC_R_SUCCESS)
2375                 sock->fd = -1;
2376
2377         if (result == ISC_R_SUCCESS) {
2378                 int lockid = FDLOCK_ID(sock->fd);
2379
2380                 LOCK(&sock->manager->fdlock[lockid]);
2381                 sock->manager->fds[sock->fd] = sock;
2382                 sock->manager->fdstate[sock->fd] = MANAGED;
2383 #ifdef USE_DEVPOLL
2384                 INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2385                        sock->manager->fdpollinfo[sock->fd].want_write == 0);
2386 #endif
2387                 UNLOCK(&sock->manager->fdlock[lockid]);
2388
2389 #ifdef USE_SELECT
2390                 LOCK(&sock->manager->lock);
2391                 if (sock->manager->maxfd < sock->fd)
2392                         sock->manager->maxfd = sock->fd;
2393                 UNLOCK(&sock->manager->lock);
2394 #endif
2395         }
2396
2397         return (result);
2398 }
2399
2400 /*
2401  * Create a new 'type' socket managed by 'manager'.  Events
2402  * will be posted to 'task' and when dispatched 'action' will be
2403  * called with 'arg' as the arg value.  The new socket is returned
2404  * in 'socketp'.
2405  */
2406 isc_result_t
2407 isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
2408                          isc_sockfdwatch_t callback, void *cbarg,
2409                          isc_task_t *task, isc_socket_t **socketp)
2410 {
2411         isc_socket_t *sock = NULL;
2412         isc_result_t result;
2413         int lockid;
2414
2415         REQUIRE(VALID_MANAGER(manager));
2416         REQUIRE(socketp != NULL && *socketp == NULL);
2417
2418         result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
2419         if (result != ISC_R_SUCCESS)
2420                 return (result);
2421
2422         sock->fd = fd;
2423         sock->fdwatcharg = cbarg;
2424         sock->fdwatchcb = callback;
2425         sock->fdwatchflags = flags;
2426         sock->fdwatchtask = task;
2427         sock->statsindex = fdwatchstatsindex;
2428
2429         sock->references = 1;
2430         *socketp = sock;
2431
2432         /*
2433          * Note we don't have to lock the socket like we normally would because
2434          * there are no external references to it yet.
2435          */
2436
2437         lockid = FDLOCK_ID(sock->fd);
2438         LOCK(&manager->fdlock[lockid]);
2439         manager->fds[sock->fd] = sock;
2440         manager->fdstate[sock->fd] = MANAGED;
2441         UNLOCK(&manager->fdlock[lockid]);
2442
2443         LOCK(&manager->lock);
2444         ISC_LIST_APPEND(manager->socklist, sock, link);
2445 #ifdef USE_SELECT
2446         if (manager->maxfd < sock->fd)
2447                 manager->maxfd = sock->fd;
2448 #endif
2449         UNLOCK(&manager->lock);
2450
2451         if (flags & ISC_SOCKFDWATCH_READ)
2452                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2453         if (flags & ISC_SOCKFDWATCH_WRITE)
2454                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2455
2456         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2457                    ISC_MSG_CREATED, "fdwatch-created");
2458
2459         return (ISC_R_SUCCESS);
2460 }
2461
2462 /*
2463  * Attach to a socket.  Caller must explicitly detach when it is done.
2464  */
2465 void
2466 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2467         REQUIRE(VALID_SOCKET(sock));
2468         REQUIRE(socketp != NULL && *socketp == NULL);
2469
2470         LOCK(&sock->lock);
2471         sock->references++;
2472         UNLOCK(&sock->lock);
2473
2474         *socketp = sock;
2475 }
2476
2477 /*
2478  * Dereference a socket.  If this is the last reference to it, clean things
2479  * up by destroying the socket.
2480  */
2481 void
2482 isc_socket_detach(isc_socket_t **socketp) {
2483         isc_socket_t *sock;
2484         isc_boolean_t kill_socket = ISC_FALSE;
2485
2486         REQUIRE(socketp != NULL);
2487         sock = *socketp;
2488         REQUIRE(VALID_SOCKET(sock));
2489
2490         LOCK(&sock->lock);
2491         REQUIRE(sock->references > 0);
2492         sock->references--;
2493         if (sock->references == 0)
2494                 kill_socket = ISC_TRUE;
2495         UNLOCK(&sock->lock);
2496
2497         if (kill_socket)
2498                 destroy(&sock);
2499
2500         *socketp = NULL;
2501 }
2502
2503 isc_result_t
2504 isc_socket_close(isc_socket_t *sock) {
2505         int fd;
2506         isc_socketmgr_t *manager;
2507         isc_sockettype_t type;
2508
2509         REQUIRE(VALID_SOCKET(sock));
2510
2511         LOCK(&sock->lock);
2512
2513         REQUIRE(sock->references == 1);
2514         REQUIRE(sock->type != isc_sockettype_fdwatch);
2515         REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2516
2517         INSIST(!sock->connecting);
2518         INSIST(!sock->pending_recv);
2519         INSIST(!sock->pending_send);
2520         INSIST(!sock->pending_accept);
2521         INSIST(ISC_LIST_EMPTY(sock->recv_list));
2522         INSIST(ISC_LIST_EMPTY(sock->send_list));
2523         INSIST(ISC_LIST_EMPTY(sock->accept_list));
2524         INSIST(sock->connect_ev == NULL);
2525
2526         manager = sock->manager;
2527         type = sock->type;
2528         fd = sock->fd;
2529         sock->fd = -1;
2530         memset(sock->name, 0, sizeof(sock->name));
2531         sock->tag = NULL;
2532         sock->listener = 0;
2533         sock->connected = 0;
2534         sock->connecting = 0;
2535         sock->bound = 0;
2536         isc_sockaddr_any(&sock->peer_address);
2537
2538         UNLOCK(&sock->lock);
2539
2540         closesocket(manager, sock, fd);
2541
2542         return (ISC_R_SUCCESS);
2543 }
2544
2545 /*
2546  * I/O is possible on a given socket.  Schedule an event to this task that
2547  * will call an internal function to do the I/O.  This will charge the
2548  * task with the I/O operation and let our select loop handler get back
2549  * to doing something real as fast as possible.
2550  *
2551  * The socket and manager must be locked before calling this function.
2552  */
2553 static void
2554 dispatch_recv(isc_socket_t *sock) {
2555         intev_t *iev;
2556         isc_socketevent_t *ev;
2557         isc_task_t *sender;
2558
2559         INSIST(!sock->pending_recv);
2560
2561         if (sock->type != isc_sockettype_fdwatch) {
2562                 ev = ISC_LIST_HEAD(sock->recv_list);
2563                 if (ev == NULL)
2564                         return;
2565                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2566                            "dispatch_recv:  event %p -> task %p",
2567                            ev, ev->ev_sender);
2568                 sender = ev->ev_sender;
2569         } else {
2570                 sender = sock->fdwatchtask;
2571         }
2572
2573         sock->pending_recv = 1;
2574         iev = &sock->readable_ev;
2575
2576         sock->references++;
2577         iev->ev_sender = sock;
2578         if (sock->type == isc_sockettype_fdwatch)
2579                 iev->ev_action = internal_fdwatch_read;
2580         else
2581                 iev->ev_action = internal_recv;
2582         iev->ev_arg = sock;
2583
2584         isc_task_send(sender, (isc_event_t **)&iev);
2585 }
2586
2587 static void
2588 dispatch_send(isc_socket_t *sock) {
2589         intev_t *iev;
2590         isc_socketevent_t *ev;
2591         isc_task_t *sender;
2592
2593         INSIST(!sock->pending_send);
2594
2595         if (sock->type != isc_sockettype_fdwatch) {
2596                 ev = ISC_LIST_HEAD(sock->send_list);
2597                 if (ev == NULL)
2598                         return;
2599                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2600                            "dispatch_send:  event %p -> task %p",
2601                            ev, ev->ev_sender);
2602                 sender = ev->ev_sender;
2603         } else {
2604                 sender = sock->fdwatchtask;
2605         }
2606
2607         sock->pending_send = 1;
2608         iev = &sock->writable_ev;
2609
2610         sock->references++;
2611         iev->ev_sender = sock;
2612         if (sock->type == isc_sockettype_fdwatch)
2613                 iev->ev_action = internal_fdwatch_write;
2614         else
2615                 iev->ev_action = internal_send;
2616         iev->ev_arg = sock;
2617
2618         isc_task_send(sender, (isc_event_t **)&iev);
2619 }
2620
2621 /*
2622  * Dispatch an internal accept event.
2623  */
2624 static void
2625 dispatch_accept(isc_socket_t *sock) {
2626         intev_t *iev;
2627         isc_socket_newconnev_t *ev;
2628
2629         INSIST(!sock->pending_accept);
2630
2631         /*
2632          * Are there any done events left, or were they all canceled
2633          * before the manager got the socket lock?
2634          */
2635         ev = ISC_LIST_HEAD(sock->accept_list);
2636         if (ev == NULL)
2637                 return;
2638
2639         sock->pending_accept = 1;
2640         iev = &sock->readable_ev;
2641
2642         sock->references++;  /* keep socket around for this internal event */
2643         iev->ev_sender = sock;
2644         iev->ev_action = internal_accept;
2645         iev->ev_arg = sock;
2646
2647         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2648 }
2649
2650 static void
2651 dispatch_connect(isc_socket_t *sock) {
2652         intev_t *iev;
2653         isc_socket_connev_t *ev;
2654
2655         iev = &sock->writable_ev;
2656
2657         ev = sock->connect_ev;
2658         INSIST(ev != NULL); /* XXX */
2659
2660         INSIST(sock->connecting);
2661
2662         sock->references++;  /* keep socket around for this internal event */
2663         iev->ev_sender = sock;
2664         iev->ev_action = internal_connect;
2665         iev->ev_arg = sock;
2666
2667         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2668 }
2669
2670 /*
2671  * Dequeue an item off the given socket's read queue, set the result code
2672  * in the done event to the one provided, and send it to the task it was
2673  * destined for.
2674  *
2675  * If the event to be sent is on a list, remove it before sending.  If
2676  * asked to, send and detach from the socket as well.
2677  *
2678  * Caller must have the socket locked if the event is attached to the socket.
2679  */
2680 static void
2681 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2682         isc_task_t *task;
2683
2684         task = (*dev)->ev_sender;
2685
2686         (*dev)->ev_sender = sock;
2687
2688         if (ISC_LINK_LINKED(*dev, ev_link))
2689                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2690
2691         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2692             == ISC_SOCKEVENTATTR_ATTACHED)
2693                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2694         else
2695                 isc_task_send(task, (isc_event_t **)dev);
2696 }
2697
2698 /*
2699  * See comments for send_recvdone_event() above.
2700  *
2701  * Caller must have the socket locked if the event is attached to the socket.
2702  */
2703 static void
2704 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2705         isc_task_t *task;
2706
2707         INSIST(dev != NULL && *dev != NULL);
2708
2709         task = (*dev)->ev_sender;
2710         (*dev)->ev_sender = sock;
2711
2712         if (ISC_LINK_LINKED(*dev, ev_link))
2713                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2714
2715         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
2716             == ISC_SOCKEVENTATTR_ATTACHED)
2717                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
2718         else
2719                 isc_task_send(task, (isc_event_t **)dev);
2720 }
2721
2722 /*
2723  * Call accept() on a socket, to get the new file descriptor.  The listen
2724  * socket is used as a prototype to create a new isc_socket_t.  The new
2725  * socket has one outstanding reference.  The task receiving the event
2726  * will be detached from just after the event is delivered.
2727  *
2728  * On entry to this function, the event delivered is the internal
2729  * readable event, and the first item on the accept_list should be
2730  * the done event we want to send.  If the list is empty, this is a no-op,
2731  * so just unlock and return.
2732  */
2733 static void
2734 internal_accept(isc_task_t *me, isc_event_t *ev) {
2735         isc_socket_t *sock;
2736         isc_socketmgr_t *manager;
2737         isc_socket_newconnev_t *dev;
2738         isc_task_t *task;
2739         ISC_SOCKADDR_LEN_T addrlen;
2740         int fd;
2741         isc_result_t result = ISC_R_SUCCESS;
2742         char strbuf[ISC_STRERRORSIZE];
2743         const char *err = "accept";
2744
2745         UNUSED(me);
2746
2747         sock = ev->ev_sender;
2748         INSIST(VALID_SOCKET(sock));
2749
2750         LOCK(&sock->lock);
2751         socket_log(sock, NULL, TRACE,
2752                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2753                    "internal_accept called, locked socket");
2754
2755         manager = sock->manager;
2756         INSIST(VALID_MANAGER(manager));
2757
2758         INSIST(sock->listener);
2759         INSIST(sock->pending_accept == 1);
2760         sock->pending_accept = 0;
2761
2762         INSIST(sock->references > 0);
2763         sock->references--;  /* the internal event is done with this socket */
2764         if (sock->references == 0) {
2765                 UNLOCK(&sock->lock);
2766                 destroy(&sock);
2767                 return;
2768         }
2769
2770         /*
2771          * Get the first item off the accept list.
2772          * If it is empty, unlock the socket and return.
2773          */
2774         dev = ISC_LIST_HEAD(sock->accept_list);
2775         if (dev == NULL) {
2776                 UNLOCK(&sock->lock);
2777                 return;
2778         }
2779
2780         /*
2781          * Try to accept the new connection.  If the accept fails with
2782          * EAGAIN or EINTR, simply poke the watcher to watch this socket
2783          * again.  Also ignore ECONNRESET, which has been reported to
2784          * be spuriously returned on Linux 2.2.19 although it is not
2785          * a documented error for accept().  ECONNABORTED has been
2786          * reported for Solaris 8.  The rest are thrown in not because
2787          * we have seen them but because they are ignored by other
2788          * daemons such as BIND 8 and Apache.
2789          */
2790
2791         addrlen = sizeof(dev->newsocket->peer_address.type);
2792         memset(&dev->newsocket->peer_address.type, 0, addrlen);
2793         fd = accept(sock->fd, &dev->newsocket->peer_address.type.sa,
2794                     (void *)&addrlen);
2795
2796 #ifdef F_DUPFD
2797         /*
2798          * Leave a space for stdio to work in.
2799          */
2800         if (fd >= 0 && fd < 20) {
2801                 int new, tmp;
2802                 new = fcntl(fd, F_DUPFD, 20);
2803                 tmp = errno;
2804                 (void)close(fd);
2805                 errno = tmp;
2806                 fd = new;
2807                 err = "accept/fcntl";
2808         }
2809 #endif
2810
2811         if (fd < 0) {
2812                 if (SOFT_ERROR(errno))
2813                         goto soft_error;
2814                 switch (errno) {
2815                 case ENFILE:
2816                 case EMFILE:
2817                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2818                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2819                                        isc_msgcat, ISC_MSGSET_SOCKET,
2820                                        ISC_MSG_TOOMANYFDS,
2821                                        "%s: too many open file descriptors",
2822                                        err);
2823                         goto soft_error;
2824
2825                 case ENOBUFS:
2826                 case ENOMEM:
2827                 case ECONNRESET:
2828                 case ECONNABORTED:
2829                 case EHOSTUNREACH:
2830                 case EHOSTDOWN:
2831                 case ENETUNREACH:
2832                 case ENETDOWN:
2833                 case ECONNREFUSED:
2834 #ifdef EPROTO
2835                 case EPROTO:
2836 #endif
2837 #ifdef ENONET
2838                 case ENONET:
2839 #endif
2840                         goto soft_error;
2841                 default:
2842                         break;
2843                 }
2844                 isc__strerror(errno, strbuf, sizeof(strbuf));
2845                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2846                                  "internal_accept: %s() %s: %s", err,
2847                                  isc_msgcat_get(isc_msgcat,
2848                                                 ISC_MSGSET_GENERAL,
2849                                                 ISC_MSG_FAILED,
2850                                                 "failed"),
2851                                  strbuf);
2852                 fd = -1;
2853                 result = ISC_R_UNEXPECTED;
2854         } else {
2855                 if (addrlen == 0U) {
2856                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2857                                          "internal_accept(): "
2858                                          "accept() failed to return "
2859                                          "remote address");
2860
2861                         (void)close(fd);
2862                         goto soft_error;
2863                 } else if (dev->newsocket->peer_address.type.sa.sa_family !=
2864                            sock->pf)
2865                 {
2866                         UNEXPECTED_ERROR(__FILE__, __LINE__,
2867                                          "internal_accept(): "
2868                                          "accept() returned peer address "
2869                                          "family %u (expected %u)",
2870                                          dev->newsocket->peer_address.
2871                                          type.sa.sa_family,
2872                                          sock->pf);
2873                         (void)close(fd);
2874                         goto soft_error;
2875                 } else if (fd >= (int)manager->maxsocks) {
2876                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2877                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2878                                        isc_msgcat, ISC_MSGSET_SOCKET,
2879                                        ISC_MSG_TOOMANYFDS,
2880                                        "accept: "
2881                                        "file descriptor exceeds limit (%d/%u)",
2882                                        fd, manager->maxsocks);
2883                         (void)close(fd);
2884                         goto soft_error;
2885                 }
2886         }
2887
2888         if (fd != -1) {
2889                 dev->newsocket->peer_address.length = addrlen;
2890                 dev->newsocket->pf = sock->pf;
2891         }
2892
2893         /*
2894          * Pull off the done event.
2895          */
2896         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2897
2898         /*
2899          * Poke watcher if there are more pending accepts.
2900          */
2901         if (!ISC_LIST_EMPTY(sock->accept_list))
2902                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2903
2904         UNLOCK(&sock->lock);
2905
2906         if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
2907                 (void)close(fd);
2908                 fd = -1;
2909                 result = ISC_R_UNEXPECTED;
2910         }
2911
2912         /*
2913          * -1 means the new socket didn't happen.
2914          */
2915         if (fd != -1) {
2916                 int lockid = FDLOCK_ID(fd);
2917
2918                 LOCK(&manager->fdlock[lockid]);
2919                 manager->fds[fd] = dev->newsocket;
2920                 manager->fdstate[fd] = MANAGED;
2921                 UNLOCK(&manager->fdlock[lockid]);
2922
2923                 LOCK(&manager->lock);
2924                 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
2925
2926                 dev->newsocket->fd = fd;
2927                 dev->newsocket->bound = 1;
2928                 dev->newsocket->connected = 1;
2929
2930                 /*
2931                  * Save away the remote address
2932                  */
2933                 dev->address = dev->newsocket->peer_address;
2934
2935 #ifdef USE_SELECT
2936                 if (manager->maxfd < fd)
2937                         manager->maxfd = fd;
2938 #endif
2939
2940                 socket_log(sock, &dev->newsocket->peer_address, CREATION,
2941                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2942                            "accepted connection, new socket %p",
2943                            dev->newsocket);
2944
2945                 UNLOCK(&manager->lock);
2946
2947                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
2948         } else {
2949                 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
2950                 dev->newsocket->references--;
2951                 free_socket(&dev->newsocket);
2952         }
2953
2954         /*
2955          * Fill in the done event details and send it off.
2956          */
2957         dev->result = result;
2958         task = dev->ev_sender;
2959         dev->ev_sender = sock;
2960
2961         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2962         return;
2963
2964  soft_error:
2965         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2966         UNLOCK(&sock->lock);
2967
2968         inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
2969         return;
2970 }
2971
2972 static void
2973 internal_recv(isc_task_t *me, isc_event_t *ev) {
2974         isc_socketevent_t *dev;
2975         isc_socket_t *sock;
2976
2977         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
2978
2979         sock = ev->ev_sender;
2980         INSIST(VALID_SOCKET(sock));
2981
2982         LOCK(&sock->lock);
2983         socket_log(sock, NULL, IOEVENT,
2984                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2985                    "internal_recv: task %p got event %p", me, ev);
2986
2987         INSIST(sock->pending_recv == 1);
2988         sock->pending_recv = 0;
2989
2990         INSIST(sock->references > 0);
2991         sock->references--;  /* the internal event is done with this socket */
2992         if (sock->references == 0) {
2993                 UNLOCK(&sock->lock);
2994                 destroy(&sock);
2995                 return;
2996         }
2997
2998         /*
2999          * Try to do as much I/O as possible on this socket.  There are no
3000          * limits here, currently.
3001          */
3002         dev = ISC_LIST_HEAD(sock->recv_list);
3003         while (dev != NULL) {
3004                 switch (doio_recv(sock, dev)) {
3005                 case DOIO_SOFT:
3006                         goto poke;
3007
3008                 case DOIO_EOF:
3009                         /*
3010                          * read of 0 means the remote end was closed.
3011                          * Run through the event queue and dispatch all
3012                          * the events with an EOF result code.
3013                          */
3014                         do {
3015                                 dev->result = ISC_R_EOF;
3016                                 send_recvdone_event(sock, &dev);
3017                                 dev = ISC_LIST_HEAD(sock->recv_list);
3018                         } while (dev != NULL);
3019                         goto poke;
3020
3021                 case DOIO_SUCCESS:
3022                 case DOIO_HARD:
3023                         send_recvdone_event(sock, &dev);
3024                         break;
3025                 }
3026
3027                 dev = ISC_LIST_HEAD(sock->recv_list);
3028         }
3029
3030  poke:
3031         if (!ISC_LIST_EMPTY(sock->recv_list))
3032                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3033
3034         UNLOCK(&sock->lock);
3035 }
3036
3037 static void
3038 internal_send(isc_task_t *me, isc_event_t *ev) {
3039         isc_socketevent_t *dev;
3040         isc_socket_t *sock;
3041
3042         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3043
3044         /*
3045          * Find out what socket this is and lock it.
3046          */
3047         sock = (isc_socket_t *)ev->ev_sender;
3048         INSIST(VALID_SOCKET(sock));
3049
3050         LOCK(&sock->lock);
3051         socket_log(sock, NULL, IOEVENT,
3052                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3053                    "internal_send: task %p got event %p", me, ev);
3054
3055         INSIST(sock->pending_send == 1);
3056         sock->pending_send = 0;
3057
3058         INSIST(sock->references > 0);
3059         sock->references--;  /* the internal event is done with this socket */
3060         if (sock->references == 0) {
3061                 UNLOCK(&sock->lock);
3062                 destroy(&sock);
3063                 return;
3064         }
3065
3066         /*
3067          * Try to do as much I/O as possible on this socket.  There are no
3068          * limits here, currently.
3069          */
3070         dev = ISC_LIST_HEAD(sock->send_list);
3071         while (dev != NULL) {
3072                 switch (doio_send(sock, dev)) {
3073                 case DOIO_SOFT:
3074                         goto poke;
3075
3076                 case DOIO_HARD:
3077                 case DOIO_SUCCESS:
3078                         send_senddone_event(sock, &dev);
3079                         break;
3080                 }
3081
3082                 dev = ISC_LIST_HEAD(sock->send_list);
3083         }
3084
3085  poke:
3086         if (!ISC_LIST_EMPTY(sock->send_list))
3087                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3088
3089         UNLOCK(&sock->lock);
3090 }
3091
3092 static void
3093 internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
3094         isc_socket_t *sock;
3095         int more_data;
3096
3097         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3098
3099         /*
3100          * Find out what socket this is and lock it.
3101          */
3102         sock = (isc_socket_t *)ev->ev_sender;
3103         INSIST(VALID_SOCKET(sock));
3104
3105         LOCK(&sock->lock);
3106         socket_log(sock, NULL, IOEVENT,
3107                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3108                    "internal_fdwatch_write: task %p got event %p", me, ev);
3109
3110         INSIST(sock->pending_send == 1);
3111
3112         UNLOCK(&sock->lock);
3113         more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg);
3114         LOCK(&sock->lock);
3115
3116         sock->pending_send = 0;
3117
3118         INSIST(sock->references > 0);
3119         sock->references--;  /* the internal event is done with this socket */
3120         if (sock->references == 0) {
3121                 UNLOCK(&sock->lock);
3122                 destroy(&sock);
3123                 return;
3124         }
3125
3126         if (more_data)
3127                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3128
3129         UNLOCK(&sock->lock);
3130 }
3131
3132 static void
3133 internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
3134         isc_socket_t *sock;
3135         int more_data;
3136
3137         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3138
3139         /*
3140          * Find out what socket this is and lock it.
3141          */
3142         sock = (isc_socket_t *)ev->ev_sender;
3143         INSIST(VALID_SOCKET(sock));
3144
3145         LOCK(&sock->lock);
3146         socket_log(sock, NULL, IOEVENT,
3147                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3148                    "internal_fdwatch_read: task %p got event %p", me, ev);
3149
3150         INSIST(sock->pending_recv == 1);
3151
3152         UNLOCK(&sock->lock);
3153         more_data = (sock->fdwatchcb)(me, sock, sock->fdwatcharg);
3154         LOCK(&sock->lock);
3155
3156         sock->pending_recv = 0;
3157
3158         INSIST(sock->references > 0);
3159         sock->references--;  /* the internal event is done with this socket */
3160         if (sock->references == 0) {
3161                 UNLOCK(&sock->lock);
3162                 destroy(&sock);
3163                 return;
3164         }
3165
3166         if (more_data)
3167                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3168
3169         UNLOCK(&sock->lock);
3170 }
3171
3172 /*
3173  * Process read/writes on each fd here.  Avoid locking
3174  * and unlocking twice if both reads and writes are possible.
3175  */
3176 static void
3177 process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
3178            isc_boolean_t writeable)
3179 {
3180         isc_socket_t *sock;
3181         isc_boolean_t unlock_sock;
3182         isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
3183         int lockid = FDLOCK_ID(fd);
3184
3185         /*
3186          * If the socket is going to be closed, don't do more I/O.
3187          */
3188         LOCK(&manager->fdlock[lockid]);
3189         if (manager->fdstate[fd] == CLOSE_PENDING) {
3190                 UNLOCK(&manager->fdlock[lockid]);
3191
3192                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3193                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3194                 return;
3195         }
3196
3197         sock = manager->fds[fd];
3198         unlock_sock = ISC_FALSE;
3199         if (readable) {
3200                 if (sock == NULL) {
3201                         unwatch_read = ISC_TRUE;
3202                         goto check_write;
3203                 }
3204                 unlock_sock = ISC_TRUE;
3205                 LOCK(&sock->lock);
3206                 if (!SOCK_DEAD(sock)) {
3207                         if (sock->listener)
3208                                 dispatch_accept(sock);
3209                         else
3210                                 dispatch_recv(sock);
3211                 }
3212                 unwatch_read = ISC_TRUE;
3213         }
3214 check_write:
3215         if (writeable) {
3216                 if (sock == NULL) {
3217                         unwatch_write = ISC_TRUE;
3218                         goto unlock_fd;
3219                 }
3220                 if (!unlock_sock) {
3221                         unlock_sock = ISC_TRUE;
3222                         LOCK(&sock->lock);
3223                 }
3224                 if (!SOCK_DEAD(sock)) {
3225                         if (sock->connecting)
3226                                 dispatch_connect(sock);
3227                         else
3228                                 dispatch_send(sock);
3229                 }
3230                 unwatch_write = ISC_TRUE;
3231         }
3232         if (unlock_sock)
3233                 UNLOCK(&sock->lock);
3234
3235  unlock_fd:
3236         UNLOCK(&manager->fdlock[lockid]);
3237         if (unwatch_read)
3238                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3239         if (unwatch_write)
3240                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3241
3242 }
3243
3244 #ifdef USE_KQUEUE
3245 static isc_boolean_t
3246 process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) {
3247         int i;
3248         isc_boolean_t readable, writable;
3249         isc_boolean_t done = ISC_FALSE;
3250 #ifdef ISC_PLATFORM_USETHREADS
3251         isc_boolean_t have_ctlevent = ISC_FALSE;
3252 #endif
3253
3254         if (nevents == manager->nevents) {
3255                 /*
3256                  * This is not an error, but something unexpected.  If this
3257                  * happens, it may indicate the need for increasing
3258                  * ISC_SOCKET_MAXEVENTS.
3259                  */
3260                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3261                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3262                             "maximum number of FD events (%d) received",
3263                             nevents);
3264         }
3265
3266         for (i = 0; i < nevents; i++) {
3267                 REQUIRE(events[i].ident < manager->maxsocks);
3268 #ifdef ISC_PLATFORM_USETHREADS
3269                 if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
3270                         have_ctlevent = ISC_TRUE;
3271                         continue;
3272                 }
3273 #endif
3274                 readable = ISC_TF(events[i].filter == EVFILT_READ);
3275                 writable = ISC_TF(events[i].filter == EVFILT_WRITE);
3276                 process_fd(manager, events[i].ident, readable, writable);
3277         }
3278
3279 #ifdef ISC_PLATFORM_USETHREADS
3280         if (have_ctlevent)
3281                 done = process_ctlfd(manager);
3282 #endif
3283
3284         return (done);
3285 }
3286 #elif defined(USE_EPOLL)
3287 static isc_boolean_t
3288 process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) {
3289         int i;
3290         isc_boolean_t done = ISC_FALSE;
3291 #ifdef ISC_PLATFORM_USETHREADS
3292         isc_boolean_t have_ctlevent = ISC_FALSE;
3293 #endif
3294
3295         if (nevents == manager->nevents) {
3296                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3297                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3298                             "maximum number of FD events (%d) received",
3299                             nevents);
3300         }
3301
3302         for (i = 0; i < nevents; i++) {
3303                 REQUIRE(events[i].data.fd < (int)manager->maxsocks);
3304 #ifdef ISC_PLATFORM_USETHREADS
3305                 if (events[i].data.fd == manager->pipe_fds[0]) {
3306                         have_ctlevent = ISC_TRUE;
3307                         continue;
3308                 }
3309 #endif
3310                 if ((events[i].events & EPOLLERR) != 0 ||
3311                     (events[i].events & EPOLLHUP) != 0) {
3312                         /*
3313                          * epoll does not set IN/OUT bits on an erroneous
3314                          * condition, so we need to try both anyway.  This is a
3315                          * bit inefficient, but should be okay for such rare
3316                          * events.  Note also that the read or write attempt
3317                          * won't block because we use non-blocking sockets.
3318                          */
3319                         events[i].events |= (EPOLLIN | EPOLLOUT);
3320                 }
3321                 process_fd(manager, events[i].data.fd,
3322                            (events[i].events & EPOLLIN) != 0,
3323                            (events[i].events & EPOLLOUT) != 0);
3324         }
3325
3326 #ifdef ISC_PLATFORM_USETHREADS
3327         if (have_ctlevent)
3328                 done = process_ctlfd(manager);
3329 #endif
3330
3331         return (done);
3332 }
3333 #elif defined(USE_DEVPOLL)
3334 static isc_boolean_t
3335 process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) {
3336         int i;
3337         isc_boolean_t done = ISC_FALSE;
3338 #ifdef ISC_PLATFORM_USETHREADS
3339         isc_boolean_t have_ctlevent = ISC_FALSE;
3340 #endif
3341
3342         if (nevents == manager->nevents) {
3343                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3344                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3345                             "maximum number of FD events (%d) received",
3346                             nevents);
3347         }
3348
3349         for (i = 0; i < nevents; i++) {
3350                 REQUIRE(events[i].fd < (int)manager->maxsocks);
3351 #ifdef ISC_PLATFORM_USETHREADS
3352                 if (events[i].fd == manager->pipe_fds[0]) {
3353                         have_ctlevent = ISC_TRUE;
3354                         continue;
3355                 }
3356 #endif
3357                 process_fd(manager, events[i].fd,
3358                            (events[i].events & POLLIN) != 0,
3359                            (events[i].events & POLLOUT) != 0);
3360         }
3361
3362 #ifdef ISC_PLATFORM_USETHREADS
3363         if (have_ctlevent)
3364                 done = process_ctlfd(manager);
3365 #endif
3366
3367         return (done);
3368 }
3369 #elif defined(USE_SELECT)
3370 static void
3371 process_fds(isc_socketmgr_t *manager, int maxfd,
3372             fd_set *readfds, fd_set *writefds)
3373 {
3374         int i;
3375
3376         REQUIRE(maxfd <= (int)manager->maxsocks);
3377
3378         for (i = 0; i < maxfd; i++) {
3379 #ifdef ISC_PLATFORM_USETHREADS
3380                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
3381                         continue;
3382 #endif /* ISC_PLATFORM_USETHREADS */
3383                 process_fd(manager, i, FD_ISSET(i, readfds),
3384                            FD_ISSET(i, writefds));
3385         }
3386 }
3387 #endif
3388
3389 #ifdef ISC_PLATFORM_USETHREADS
3390 static isc_boolean_t
3391 process_ctlfd(isc_socketmgr_t *manager) {
3392         int msg, fd;
3393
3394         for (;;) {
3395                 select_readmsg(manager, &fd, &msg);
3396
3397                 manager_log(manager, IOEVENT,
3398                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3399                                            ISC_MSG_WATCHERMSG,
3400                                            "watcher got message %d "
3401                                            "for socket %d"), msg, fd);
3402
3403                 /*
3404                  * Nothing to read?
3405                  */
3406                 if (msg == SELECT_POKE_NOTHING)
3407                         break;
3408
3409                 /*
3410                  * Handle shutdown message.  We really should
3411                  * jump out of this loop right away, but
3412                  * it doesn't matter if we have to do a little
3413                  * more work first.
3414                  */
3415                 if (msg == SELECT_POKE_SHUTDOWN)
3416                         return (ISC_TRUE);
3417
3418                 /*
3419                  * This is a wakeup on a socket.  Look
3420                  * at the event queue for both read and write,
3421                  * and decide if we need to watch on it now
3422                  * or not.
3423                  */
3424                 wakeup_socket(manager, fd, msg);
3425         }
3426
3427         return (ISC_FALSE);
3428 }
3429
3430 /*
3431  * This is the thread that will loop forever, always in a select or poll
3432  * call.
3433  *
3434  * When select returns something to do, track down what thread gets to do
3435  * this I/O and post the event to it.
3436  */
3437 static isc_threadresult_t
3438 watcher(void *uap) {
3439         isc_socketmgr_t *manager = uap;
3440         isc_boolean_t done;
3441         int ctlfd;
3442         int cc;
3443 #ifdef USE_KQUEUE
3444         const char *fnname = "kevent()";
3445 #elif defined (USE_EPOLL)
3446         const char *fnname = "epoll_wait()";
3447 #elif defined(USE_DEVPOLL)
3448         const char *fnname = "ioctl(DP_POLL)";
3449         struct dvpoll dvp;
3450 #elif defined (USE_SELECT)
3451         const char *fnname = "select()";
3452         int maxfd;
3453 #endif
3454         char strbuf[ISC_STRERRORSIZE];
3455 #ifdef ISC_SOCKET_USE_POLLWATCH
3456         pollstate_t pollstate = poll_idle;
3457 #endif
3458
3459         /*
3460          * Get the control fd here.  This will never change.
3461          */
3462         ctlfd = manager->pipe_fds[0];
3463         done = ISC_FALSE;
3464         while (!done) {
3465                 do {
3466 #ifdef USE_KQUEUE
3467                         cc = kevent(manager->kqueue_fd, NULL, 0,
3468                                     manager->events, manager->nevents, NULL);
3469 #elif defined(USE_EPOLL)
3470                         cc = epoll_wait(manager->epoll_fd, manager->events,
3471                                         manager->nevents, -1);
3472 #elif defined(USE_DEVPOLL)
3473                         dvp.dp_fds = manager->events;
3474                         dvp.dp_nfds = manager->nevents;
3475 #ifndef ISC_SOCKET_USE_POLLWATCH
3476                         dvp.dp_timeout = -1;
3477 #else
3478                         if (pollstate == poll_idle)
3479                                 dvp.dp_timeout = -1;
3480                         else
3481                                 dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
3482 #endif  /* ISC_SOCKET_USE_POLLWATCH */
3483                         cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
3484 #elif defined(USE_SELECT)
3485                         LOCK(&manager->lock);
3486                         memcpy(manager->read_fds_copy, manager->read_fds,
3487                                manager->fd_bufsize);
3488                         memcpy(manager->write_fds_copy, manager->write_fds,
3489                                manager->fd_bufsize);
3490                         maxfd = manager->maxfd + 1;
3491                         UNLOCK(&manager->lock);
3492
3493                         cc = select(maxfd, manager->read_fds_copy,
3494                                     manager->write_fds_copy, NULL, NULL);
3495 #endif  /* USE_KQUEUE */
3496
3497                         if (cc < 0 && !SOFT_ERROR(errno)) {
3498                                 isc__strerror(errno, strbuf, sizeof(strbuf));
3499                                 FATAL_ERROR(__FILE__, __LINE__,
3500                                             "%s %s: %s", fnname,
3501                                             isc_msgcat_get(isc_msgcat,
3502                                                            ISC_MSGSET_GENERAL,
3503                                                            ISC_MSG_FAILED,
3504                                                            "failed"), strbuf);
3505                         }
3506
3507 #if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
3508                         if (cc == 0) {
3509                                 if (pollstate == poll_active)
3510                                         pollstate = poll_checking;
3511                                 else if (pollstate == poll_checking)
3512                                         pollstate = poll_idle;
3513                         } else if (cc > 0) {
3514                                 if (pollstate == poll_checking) {
3515                                         /*
3516                                          * XXX: We'd like to use a more
3517                                          * verbose log level as it's actually an
3518                                          * unexpected event, but the kernel bug
3519                                          * reportedly happens pretty frequently
3520                                          * (and it can also be a false positive)
3521                                          * so it would be just too noisy.
3522                                          */
3523                                         manager_log(manager,
3524                                                     ISC_LOGCATEGORY_GENERAL,
3525                                                     ISC_LOGMODULE_SOCKET,
3526                                                     ISC_LOG_DEBUG(1),
3527                                                     "unexpected POLL timeout");
3528                                 }
3529                                 pollstate = poll_active;
3530                         }
3531 #endif
3532                 } while (cc < 0);
3533
3534 #if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
3535                 done = process_fds(manager, manager->events, cc);
3536 #elif defined(USE_SELECT)
3537                 process_fds(manager, maxfd, manager->read_fds_copy,
3538                             manager->write_fds_copy);
3539
3540                 /*
3541                  * Process reads on internal, control fd.
3542                  */
3543                 if (FD_ISSET(ctlfd, manager->read_fds_copy))
3544                         done = process_ctlfd(manager);
3545 #endif
3546         }
3547
3548         manager_log(manager, TRACE, "%s",
3549                     isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3550                                    ISC_MSG_EXITING, "watcher exiting"));
3551
3552         return ((isc_threadresult_t)0);
3553 }
3554 #endif /* ISC_PLATFORM_USETHREADS */
3555
3556 void
3557 isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3558
3559         REQUIRE(VALID_MANAGER(manager));
3560
3561         manager->reserved = reserved;
3562 }
3563
3564 /*
3565  * Create a new socket manager.
3566  */
3567
3568 static isc_result_t
3569 setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3570         isc_result_t result;
3571 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
3572         char strbuf[ISC_STRERRORSIZE];
3573 #endif
3574
3575 #ifdef USE_KQUEUE
3576         manager->nevents = ISC_SOCKET_MAXEVENTS;
3577         manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
3578                                       manager->nevents);
3579         if (manager->events == NULL)
3580                 return (ISC_R_NOMEMORY);
3581         manager->kqueue_fd = kqueue();
3582         if (manager->kqueue_fd == -1) {
3583                 result = isc__errno2result(errno);
3584                 isc__strerror(errno, strbuf, sizeof(strbuf));
3585                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3586                                  "kqueue %s: %s",
3587                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3588                                                 ISC_MSG_FAILED, "failed"),
3589                                  strbuf);
3590                 isc_mem_put(mctx, manager->events,
3591                             sizeof(struct kevent) * manager->nevents);
3592                 return (result);
3593         }
3594
3595 #ifdef ISC_PLATFORM_USETHREADS
3596         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3597         if (result != ISC_R_SUCCESS) {
3598                 close(manager->kqueue_fd);
3599                 isc_mem_put(mctx, manager->events,
3600                             sizeof(struct kevent) * manager->nevents);
3601                 return (result);
3602         }
3603 #endif  /* ISC_PLATFORM_USETHREADS */
3604 #elif defined(USE_EPOLL)
3605         manager->nevents = ISC_SOCKET_MAXEVENTS;
3606         manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
3607                                       manager->nevents);
3608         if (manager->events == NULL)
3609                 return (ISC_R_NOMEMORY);
3610         manager->epoll_fd = epoll_create(manager->nevents);
3611         if (manager->epoll_fd == -1) {
3612                 result = isc__errno2result(errno);
3613                 isc__strerror(errno, strbuf, sizeof(strbuf));
3614                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3615                                  "epoll_create %s: %s",
3616                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3617                                                 ISC_MSG_FAILED, "failed"),
3618                                  strbuf);
3619                 isc_mem_put(mctx, manager->events,
3620                             sizeof(struct epoll_event) * manager->nevents);
3621                 return (result);
3622         }
3623 #ifdef ISC_PLATFORM_USETHREADS
3624         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3625         if (result != ISC_R_SUCCESS) {
3626                 close(manager->epoll_fd);
3627                 isc_mem_put(mctx, manager->events,
3628                             sizeof(struct epoll_event) * manager->nevents);
3629                 return (result);
3630         }
3631 #endif  /* ISC_PLATFORM_USETHREADS */
3632 #elif defined(USE_DEVPOLL)
3633         /*
3634          * XXXJT: /dev/poll seems to reject large numbers of events,
3635          * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
3636          */
3637         manager->nevents = ISC_SOCKET_MAXEVENTS;
3638         manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
3639                                       manager->nevents);
3640         if (manager->events == NULL)
3641                 return (ISC_R_NOMEMORY);
3642         /*
3643          * Note: fdpollinfo should be able to support all possible FDs, so
3644          * it must have maxsocks entries (not nevents).
3645          */
3646         manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
3647                                           manager->maxsocks);
3648         if (manager->fdpollinfo == NULL) {
3649                 isc_mem_put(mctx, manager->events,
3650                             sizeof(struct pollfd) * manager->nevents);
3651                 return (ISC_R_NOMEMORY);
3652         }
3653         memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
3654         manager->devpoll_fd = open("/dev/poll", O_RDWR);
3655         if (manager->devpoll_fd == -1) {
3656                 result = isc__errno2result(errno);
3657                 isc__strerror(errno, strbuf, sizeof(strbuf));
3658                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3659                                  "open(/dev/poll) %s: %s",
3660                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3661                                                 ISC_MSG_FAILED, "failed"),
3662                                  strbuf);
3663                 isc_mem_put(mctx, manager->events,
3664                             sizeof(struct pollfd) * manager->nevents);
3665                 isc_mem_put(mctx, manager->fdpollinfo,
3666                             sizeof(pollinfo_t) * manager->maxsocks);
3667                 return (result);
3668         }
3669 #ifdef ISC_PLATFORM_USETHREADS
3670         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3671         if (result != ISC_R_SUCCESS) {
3672                 close(manager->devpoll_fd);
3673                 isc_mem_put(mctx, manager->events,
3674                             sizeof(struct pollfd) * manager->nevents);
3675                 isc_mem_put(mctx, manager->fdpollinfo,
3676                             sizeof(pollinfo_t) * manager->maxsocks);
3677                 return (result);
3678         }
3679 #endif  /* ISC_PLATFORM_USETHREADS */
3680 #elif defined(USE_SELECT)
3681         UNUSED(result);
3682
3683 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3684         /*
3685          * Note: this code should also cover the case of MAXSOCKETS <=
3686          * FD_SETSIZE, but we separate the cases to avoid possible portability
3687          * issues regarding howmany() and the actual representation of fd_set.
3688          */
3689         manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3690                 sizeof(fd_mask);
3691 #else
3692         manager->fd_bufsize = sizeof(fd_set);
3693 #endif
3694
3695         manager->read_fds = NULL;
3696         manager->read_fds_copy = NULL;
3697         manager->write_fds = NULL;
3698         manager->write_fds_copy = NULL;
3699
3700         manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
3701         if (manager->read_fds != NULL)
3702                 manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
3703         if (manager->read_fds_copy != NULL)
3704                 manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
3705         if (manager->write_fds != NULL) {
3706                 manager->write_fds_copy = isc_mem_get(mctx,
3707                                                       manager->fd_bufsize);
3708         }
3709         if (manager->write_fds_copy == NULL) {
3710                 if (manager->write_fds != NULL) {
3711                         isc_mem_put(mctx, manager->write_fds,
3712                                     manager->fd_bufsize);
3713                 }
3714                 if (manager->read_fds_copy != NULL) {
3715                         isc_mem_put(mctx, manager->read_fds_copy,
3716                                     manager->fd_bufsize);
3717                 }
3718                 if (manager->read_fds != NULL) {
3719                         isc_mem_put(mctx, manager->read_fds,
3720                                     manager->fd_bufsize);
3721                 }
3722                 return (ISC_R_NOMEMORY);
3723         }
3724         memset(manager->read_fds, 0, manager->fd_bufsize);
3725         memset(manager->write_fds, 0, manager->fd_bufsize);
3726
3727 #ifdef ISC_PLATFORM_USETHREADS
3728         (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3729         manager->maxfd = manager->pipe_fds[0];
3730 #else /* ISC_PLATFORM_USETHREADS */
3731         manager->maxfd = 0;
3732 #endif /* ISC_PLATFORM_USETHREADS */
3733 #endif  /* USE_KQUEUE */
3734
3735         return (ISC_R_SUCCESS);
3736 }
3737
3738 static void
3739 cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
3740 #ifdef ISC_PLATFORM_USETHREADS
3741         isc_result_t result;
3742
3743         result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3744         if (result != ISC_R_SUCCESS) {
3745                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3746                                  "epoll_ctl(DEL) %s",
3747                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3748                                                 ISC_MSG_FAILED, "failed"));
3749         }
3750 #endif  /* ISC_PLATFORM_USETHREADS */
3751
3752 #ifdef USE_KQUEUE
3753         close(manager->kqueue_fd);
3754         isc_mem_put(mctx, manager->events,
3755                     sizeof(struct kevent) * manager->nevents);
3756 #elif defined(USE_EPOLL)
3757         close(manager->epoll_fd);
3758         isc_mem_put(mctx, manager->events,
3759                     sizeof(struct epoll_event) * manager->nevents);
3760 #elif defined(USE_DEVPOLL)
3761         close(manager->devpoll_fd);
3762         isc_mem_put(mctx, manager->events,
3763                     sizeof(struct pollfd) * manager->nevents);
3764         isc_mem_put(mctx, manager->fdpollinfo,
3765                     sizeof(pollinfo_t) * manager->maxsocks);
3766 #elif defined(USE_SELECT)
3767         if (manager->read_fds != NULL)
3768                 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
3769         if (manager->read_fds_copy != NULL)
3770                 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
3771         if (manager->write_fds != NULL)
3772                 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
3773         if (manager->write_fds_copy != NULL)
3774                 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
3775 #endif  /* USE_KQUEUE */
3776 }
3777
3778 isc_result_t
3779 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
3780         return (isc_socketmgr_create2(mctx, managerp, 0));
3781 }
3782
3783 isc_result_t
3784 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3785                       unsigned int maxsocks)
3786 {
3787         int i;
3788         isc_socketmgr_t *manager;
3789 #ifdef ISC_PLATFORM_USETHREADS
3790         char strbuf[ISC_STRERRORSIZE];
3791 #endif
3792         isc_result_t result;
3793
3794         REQUIRE(managerp != NULL && *managerp == NULL);
3795
3796 #ifndef ISC_PLATFORM_USETHREADS
3797         if (socketmgr != NULL) {
3798                 /* Don't allow maxsocks to be updated */
3799                 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
3800                         return (ISC_R_EXISTS);
3801
3802                 socketmgr->refs++;
3803                 *managerp = socketmgr;
3804                 return (ISC_R_SUCCESS);
3805         }
3806 #endif /* ISC_PLATFORM_USETHREADS */
3807
3808         if (maxsocks == 0)
3809                 maxsocks = ISC_SOCKET_MAXSOCKETS;
3810
3811         manager = isc_mem_get(mctx, sizeof(*manager));
3812         if (manager == NULL)
3813                 return (ISC_R_NOMEMORY);
3814
3815         /* zero-clear so that necessary cleanup on failure will be easy */
3816         memset(manager, 0, sizeof(*manager));
3817         manager->maxsocks = maxsocks;
3818         manager->reserved = 0;
3819         manager->fds = isc_mem_get(mctx,
3820                                    manager->maxsocks * sizeof(isc_socket_t *));
3821         if (manager->fds == NULL) {
3822                 result = ISC_R_NOMEMORY;
3823                 goto free_manager;
3824         }
3825         manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
3826         if (manager->fdstate == NULL) {
3827                 result = ISC_R_NOMEMORY;
3828                 goto free_manager;
3829         }
3830         manager->stats = NULL;
3831
3832         manager->magic = SOCKET_MANAGER_MAGIC;
3833         manager->mctx = NULL;
3834         memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
3835         ISC_LIST_INIT(manager->socklist);
3836         result = isc_mutex_init(&manager->lock);
3837         if (result != ISC_R_SUCCESS)
3838                 goto free_manager;
3839         manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
3840         if (manager->fdlock == NULL) {
3841                 result = ISC_R_NOMEMORY;
3842                 goto cleanup_lock;
3843         }
3844         for (i = 0; i < FDLOCK_COUNT; i++) {
3845                 result = isc_mutex_init(&manager->fdlock[i]);
3846                 if (result != ISC_R_SUCCESS) {
3847                         while (--i >= 0)
3848                                 DESTROYLOCK(&manager->fdlock[i]);
3849                         isc_mem_put(mctx, manager->fdlock,
3850                                     FDLOCK_COUNT * sizeof(isc_mutex_t));
3851                         manager->fdlock = NULL;
3852                         goto cleanup_lock;
3853                 }
3854         }
3855
3856 #ifdef ISC_PLATFORM_USETHREADS
3857         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
3858                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3859                                  "isc_condition_init() %s",
3860                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3861                                                 ISC_MSG_FAILED, "failed"));
3862                 result = ISC_R_UNEXPECTED;
3863                 goto cleanup_lock;
3864         }
3865
3866         /*
3867          * Create the special fds that will be used to wake up the
3868          * select/poll loop when something internal needs to be done.
3869          */
3870         if (pipe(manager->pipe_fds) != 0) {
3871                 isc__strerror(errno, strbuf, sizeof(strbuf));
3872                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3873                                  "pipe() %s: %s",
3874                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3875                                                 ISC_MSG_FAILED, "failed"),
3876                                  strbuf);
3877                 result = ISC_R_UNEXPECTED;
3878                 goto cleanup_condition;
3879         }
3880
3881         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
3882 #if 0
3883         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
3884 #endif
3885 #else /* ISC_PLATFORM_USETHREADS */
3886         manager->refs = 1;
3887 #endif /* ISC_PLATFORM_USETHREADS */
3888
3889         /*
3890          * Set up initial state for the select loop
3891          */
3892         result = setup_watcher(mctx, manager);
3893         if (result != ISC_R_SUCCESS)
3894                 goto cleanup;
3895         memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
3896 #ifdef ISC_PLATFORM_USETHREADS
3897         /*
3898          * Start up the select/poll thread.
3899          */
3900         if (isc_thread_create(watcher, manager, &manager->watcher) !=
3901             ISC_R_SUCCESS) {
3902                 UNEXPECTED_ERROR(__FILE__, __LINE__,
3903                                  "isc_thread_create() %s",
3904                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3905                                                 ISC_MSG_FAILED, "failed"));
3906                 cleanup_watcher(mctx, manager);
3907                 result = ISC_R_UNEXPECTED;
3908                 goto cleanup;
3909         }
3910 #endif /* ISC_PLATFORM_USETHREADS */
3911         isc_mem_attach(mctx, &manager->mctx);
3912
3913 #ifndef ISC_PLATFORM_USETHREADS
3914         socketmgr = manager;
3915 #endif /* ISC_PLATFORM_USETHREADS */
3916         *managerp = manager;
3917
3918         return (ISC_R_SUCCESS);
3919
3920 cleanup:
3921 #ifdef ISC_PLATFORM_USETHREADS
3922         (void)close(manager->pipe_fds[0]);
3923         (void)close(manager->pipe_fds[1]);
3924 #endif  /* ISC_PLATFORM_USETHREADS */
3925
3926 #ifdef ISC_PLATFORM_USETHREADS
3927 cleanup_condition:
3928         (void)isc_condition_destroy(&manager->shutdown_ok);
3929 #endif  /* ISC_PLATFORM_USETHREADS */
3930
3931
3932 cleanup_lock:
3933         if (manager->fdlock != NULL) {
3934                 for (i = 0; i < FDLOCK_COUNT; i++)
3935                         DESTROYLOCK(&manager->fdlock[i]);
3936         }
3937         DESTROYLOCK(&manager->lock);
3938
3939 free_manager:
3940         if (manager->fdlock != NULL) {
3941                 isc_mem_put(mctx, manager->fdlock,
3942                             FDLOCK_COUNT * sizeof(isc_mutex_t));
3943         }
3944         if (manager->fdstate != NULL) {
3945                 isc_mem_put(mctx, manager->fdstate,
3946                             manager->maxsocks * sizeof(int));
3947         }
3948         if (manager->fds != NULL) {
3949                 isc_mem_put(mctx, manager->fds,
3950                             manager->maxsocks * sizeof(isc_socket_t *));
3951         }
3952         isc_mem_put(mctx, manager, sizeof(*manager));
3953
3954         return (result);
3955 }
3956
3957 isc_result_t
3958 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
3959         REQUIRE(VALID_MANAGER(manager));
3960         REQUIRE(nsockp != NULL);
3961
3962         *nsockp = manager->maxsocks;
3963
3964         return (ISC_R_SUCCESS);
3965 }
3966
3967 void
3968 isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
3969         REQUIRE(VALID_MANAGER(manager));
3970         REQUIRE(ISC_LIST_EMPTY(manager->socklist));
3971         REQUIRE(manager->stats == NULL);
3972         REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
3973
3974         isc_stats_attach(stats, &manager->stats);
3975 }
3976
3977 void
3978 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
3979         isc_socketmgr_t *manager;
3980         int i;
3981         isc_mem_t *mctx;
3982
3983         /*
3984          * Destroy a socket manager.
3985          */
3986
3987         REQUIRE(managerp != NULL);
3988         manager = *managerp;
3989         REQUIRE(VALID_MANAGER(manager));
3990
3991 #ifndef ISC_PLATFORM_USETHREADS
3992         if (manager->refs > 1) {
3993                 manager->refs--;
3994                 *managerp = NULL;
3995                 return;
3996         }
3997 #endif /* ISC_PLATFORM_USETHREADS */
3998
3999         LOCK(&manager->lock);
4000
4001 #ifdef ISC_PLATFORM_USETHREADS
4002         /*
4003          * Wait for all sockets to be destroyed.
4004          */
4005         while (!ISC_LIST_EMPTY(manager->socklist)) {
4006                 manager_log(manager, CREATION, "%s",
4007                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
4008                                            ISC_MSG_SOCKETSREMAIN,
4009                                            "sockets exist"));
4010                 WAIT(&manager->shutdown_ok, &manager->lock);
4011         }
4012 #else /* ISC_PLATFORM_USETHREADS */
4013         /*
4014          * Hope all sockets have been destroyed.
4015          */
4016         if (!ISC_LIST_EMPTY(manager->socklist)) {
4017                 manager_log(manager, CREATION, "%s",
4018                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
4019                                            ISC_MSG_SOCKETSREMAIN,
4020                                            "sockets exist"));
4021                 INSIST(0);
4022         }
4023 #endif /* ISC_PLATFORM_USETHREADS */
4024
4025         UNLOCK(&manager->lock);
4026
4027         /*
4028          * Here, poke our select/poll thread.  Do this by closing the write
4029          * half of the pipe, which will send EOF to the read half.
4030          * This is currently a no-op in the non-threaded case.
4031          */
4032         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
4033
4034 #ifdef ISC_PLATFORM_USETHREADS
4035         /*
4036          * Wait for thread to exit.
4037          */
4038         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
4039                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4040                                  "isc_thread_join() %s",
4041                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4042                                                 ISC_MSG_FAILED, "failed"));
4043 #endif /* ISC_PLATFORM_USETHREADS */
4044
4045         /*
4046          * Clean up.
4047          */
4048         cleanup_watcher(manager->mctx, manager);
4049
4050 #ifdef ISC_PLATFORM_USETHREADS
4051         (void)close(manager->pipe_fds[0]);
4052         (void)close(manager->pipe_fds[1]);
4053         (void)isc_condition_destroy(&manager->shutdown_ok);
4054 #endif /* ISC_PLATFORM_USETHREADS */
4055
4056         for (i = 0; i < (int)manager->maxsocks; i++)
4057                 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
4058                         (void)close(i);
4059
4060         isc_mem_put(manager->mctx, manager->fds,
4061                     manager->maxsocks * sizeof(isc_socket_t *));
4062         isc_mem_put(manager->mctx, manager->fdstate,
4063                     manager->maxsocks * sizeof(int));
4064
4065         if (manager->stats != NULL)
4066                 isc_stats_detach(&manager->stats);
4067
4068         if (manager->fdlock != NULL) {
4069                 for (i = 0; i < FDLOCK_COUNT; i++)
4070                         DESTROYLOCK(&manager->fdlock[i]);
4071                 isc_mem_put(manager->mctx, manager->fdlock,
4072                             FDLOCK_COUNT * sizeof(isc_mutex_t));
4073         }
4074         DESTROYLOCK(&manager->lock);
4075         manager->magic = 0;
4076         mctx= manager->mctx;
4077         isc_mem_put(mctx, manager, sizeof(*manager));
4078
4079         isc_mem_detach(&mctx);
4080
4081         *managerp = NULL;
4082 }
4083
4084 static isc_result_t
4085 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4086             unsigned int flags)
4087 {
4088         int io_state;
4089         isc_boolean_t have_lock = ISC_FALSE;
4090         isc_task_t *ntask = NULL;
4091         isc_result_t result = ISC_R_SUCCESS;
4092
4093         dev->ev_sender = task;
4094
4095         if (sock->type == isc_sockettype_udp) {
4096                 io_state = doio_recv(sock, dev);
4097         } else {
4098                 LOCK(&sock->lock);
4099                 have_lock = ISC_TRUE;
4100
4101                 if (ISC_LIST_EMPTY(sock->recv_list))
4102                         io_state = doio_recv(sock, dev);
4103                 else
4104                         io_state = DOIO_SOFT;
4105         }
4106
4107         switch (io_state) {
4108         case DOIO_SOFT:
4109                 /*
4110                  * We couldn't read all or part of the request right now, so
4111                  * queue it.
4112                  *
4113                  * Attach to socket and to task
4114                  */
4115                 isc_task_attach(task, &ntask);
4116                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4117
4118                 if (!have_lock) {
4119                         LOCK(&sock->lock);
4120                         have_lock = ISC_TRUE;
4121                 }
4122
4123                 /*
4124                  * Enqueue the request.  If the socket was previously not being
4125                  * watched, poke the watcher to start paying attention to it.
4126                  */
4127                 if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
4128                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
4129                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
4130
4131                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
4132                            "socket_recv: event %p -> task %p",
4133                            dev, ntask);
4134
4135                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4136                         result = ISC_R_INPROGRESS;
4137                 break;
4138
4139         case DOIO_EOF:
4140                 dev->result = ISC_R_EOF;
4141                 /* fallthrough */
4142
4143         case DOIO_HARD:
4144         case DOIO_SUCCESS:
4145                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4146                         send_recvdone_event(sock, &dev);
4147                 break;
4148         }
4149
4150         if (have_lock)
4151                 UNLOCK(&sock->lock);
4152
4153         return (result);
4154 }
4155
4156 isc_result_t
4157 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4158                  unsigned int minimum, isc_task_t *task,
4159                  isc_taskaction_t action, const void *arg)
4160 {
4161         isc_socketevent_t *dev;
4162         isc_socketmgr_t *manager;
4163         unsigned int iocount;
4164         isc_buffer_t *buffer;
4165
4166         REQUIRE(VALID_SOCKET(sock));
4167         REQUIRE(buflist != NULL);
4168         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4169         REQUIRE(task != NULL);
4170         REQUIRE(action != NULL);
4171
4172         manager = sock->manager;
4173         REQUIRE(VALID_MANAGER(manager));
4174
4175         iocount = isc_bufferlist_availablecount(buflist);
4176         REQUIRE(iocount > 0);
4177
4178         INSIST(sock->bound);
4179
4180         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4181         if (dev == NULL) {
4182                 return (ISC_R_NOMEMORY);
4183         }
4184
4185         /*
4186          * UDP sockets are always partial read
4187          */
4188         if (sock->type == isc_sockettype_udp)
4189                 dev->minimum = 1;
4190         else {
4191                 if (minimum == 0)
4192                         dev->minimum = iocount;
4193                 else
4194                         dev->minimum = minimum;
4195         }
4196
4197         /*
4198          * Move each buffer from the passed in list to our internal one.
4199          */
4200         buffer = ISC_LIST_HEAD(*buflist);
4201         while (buffer != NULL) {
4202                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4203                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4204                 buffer = ISC_LIST_HEAD(*buflist);
4205         }
4206
4207         return (socket_recv(sock, dev, task, 0));
4208 }
4209
4210 isc_result_t
4211 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
4212                 isc_task_t *task, isc_taskaction_t action, const void *arg)
4213 {
4214         isc_socketevent_t *dev;
4215         isc_socketmgr_t *manager;
4216
4217         REQUIRE(VALID_SOCKET(sock));
4218         REQUIRE(action != NULL);
4219
4220         manager = sock->manager;
4221         REQUIRE(VALID_MANAGER(manager));
4222
4223         INSIST(sock->bound);
4224
4225         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4226         if (dev == NULL)
4227                 return (ISC_R_NOMEMORY);
4228
4229         return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
4230 }
4231
4232 isc_result_t
4233 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
4234                  unsigned int minimum, isc_task_t *task,
4235                  isc_socketevent_t *event, unsigned int flags)
4236 {
4237         event->ev_sender = sock;
4238         event->result = ISC_R_UNEXPECTED;
4239         ISC_LIST_INIT(event->bufferlist);
4240         event->region = *region;
4241         event->n = 0;
4242         event->offset = 0;
4243         event->attributes = 0;
4244
4245         /*
4246          * UDP sockets are always partial read.
4247          */
4248         if (sock->type == isc_sockettype_udp)
4249                 event->minimum = 1;
4250         else {
4251                 if (minimum == 0)
4252                         event->minimum = region->length;
4253                 else
4254                         event->minimum = minimum;
4255         }
4256
4257         return (socket_recv(sock, event, task, flags));
4258 }
4259
4260 static isc_result_t
4261 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4262             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4263             unsigned int flags)
4264 {
4265         int io_state;
4266         isc_boolean_t have_lock = ISC_FALSE;
4267         isc_task_t *ntask = NULL;
4268         isc_result_t result = ISC_R_SUCCESS;
4269
4270         dev->ev_sender = task;
4271
4272         set_dev_address(address, sock, dev);
4273         if (pktinfo != NULL) {
4274                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
4275                 dev->pktinfo = *pktinfo;
4276
4277                 if (!isc_sockaddr_issitelocal(&dev->address) &&
4278                     !isc_sockaddr_islinklocal(&dev->address)) {
4279                         socket_log(sock, NULL, TRACE, isc_msgcat,
4280                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
4281                                    "pktinfo structure provided, ifindex %u "
4282                                    "(set to 0)", pktinfo->ipi6_ifindex);
4283
4284                         /*
4285                          * Set the pktinfo index to 0 here, to let the
4286                          * kernel decide what interface it should send on.
4287                          */
4288                         dev->pktinfo.ipi6_ifindex = 0;
4289                 }
4290         }
4291
4292         if (sock->type == isc_sockettype_udp)
4293                 io_state = doio_send(sock, dev);
4294         else {
4295                 LOCK(&sock->lock);
4296                 have_lock = ISC_TRUE;
4297
4298                 if (ISC_LIST_EMPTY(sock->send_list))
4299                         io_state = doio_send(sock, dev);
4300                 else
4301                         io_state = DOIO_SOFT;
4302         }
4303
4304         switch (io_state) {
4305         case DOIO_SOFT:
4306                 /*
4307                  * We couldn't send all or part of the request right now, so
4308                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
4309                  */
4310                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
4311                         isc_task_attach(task, &ntask);
4312                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4313
4314                         if (!have_lock) {
4315                                 LOCK(&sock->lock);
4316                                 have_lock = ISC_TRUE;
4317                         }
4318
4319                         /*
4320                          * Enqueue the request.  If the socket was previously
4321                          * not being watched, poke the watcher to start
4322                          * paying attention to it.
4323                          */
4324                         if (ISC_LIST_EMPTY(sock->send_list) &&
4325                             !sock->pending_send)
4326                                 select_poke(sock->manager, sock->fd,
4327                                             SELECT_POKE_WRITE);
4328                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
4329
4330                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
4331                                    "socket_send: event %p -> task %p",
4332                                    dev, ntask);
4333
4334                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
4335                                 result = ISC_R_INPROGRESS;
4336                         break;
4337                 }
4338
4339         case DOIO_HARD:
4340         case DOIO_SUCCESS:
4341                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
4342                         send_senddone_event(sock, &dev);
4343                 break;
4344         }
4345
4346         if (have_lock)
4347                 UNLOCK(&sock->lock);
4348
4349         return (result);
4350 }
4351
4352 isc_result_t
4353 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
4354                 isc_task_t *task, isc_taskaction_t action, const void *arg)
4355 {
4356         /*
4357          * REQUIRE() checking is performed in isc_socket_sendto().
4358          */
4359         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
4360                                   NULL));
4361 }
4362
4363 isc_result_t
4364 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
4365                   isc_task_t *task, isc_taskaction_t action, const void *arg,
4366                   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4367 {
4368         isc_socketevent_t *dev;
4369         isc_socketmgr_t *manager;
4370
4371         REQUIRE(VALID_SOCKET(sock));
4372         REQUIRE(region != NULL);
4373         REQUIRE(task != NULL);
4374         REQUIRE(action != NULL);
4375
4376         manager = sock->manager;
4377         REQUIRE(VALID_MANAGER(manager));
4378
4379         INSIST(sock->bound);
4380
4381         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4382         if (dev == NULL) {
4383                 return (ISC_R_NOMEMORY);
4384         }
4385
4386         dev->region = *region;
4387
4388         return (socket_send(sock, dev, task, address, pktinfo, 0));
4389 }
4390
4391 isc_result_t
4392 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4393                  isc_task_t *task, isc_taskaction_t action, const void *arg)
4394 {
4395         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
4396                                    NULL));
4397 }
4398
4399 isc_result_t
4400 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
4401                    isc_task_t *task, isc_taskaction_t action, const void *arg,
4402                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4403 {
4404         isc_socketevent_t *dev;
4405         isc_socketmgr_t *manager;
4406         unsigned int iocount;
4407         isc_buffer_t *buffer;
4408
4409         REQUIRE(VALID_SOCKET(sock));
4410         REQUIRE(buflist != NULL);
4411         REQUIRE(!ISC_LIST_EMPTY(*buflist));
4412         REQUIRE(task != NULL);
4413         REQUIRE(action != NULL);
4414
4415         manager = sock->manager;
4416         REQUIRE(VALID_MANAGER(manager));
4417
4418         iocount = isc_bufferlist_usedcount(buflist);
4419         REQUIRE(iocount > 0);
4420
4421         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4422         if (dev == NULL) {
4423                 return (ISC_R_NOMEMORY);
4424         }
4425
4426         /*
4427          * Move each buffer from the passed in list to our internal one.
4428          */
4429         buffer = ISC_LIST_HEAD(*buflist);
4430         while (buffer != NULL) {
4431                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
4432                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4433                 buffer = ISC_LIST_HEAD(*buflist);
4434         }
4435
4436         return (socket_send(sock, dev, task, address, pktinfo, 0));
4437 }
4438
4439 isc_result_t
4440 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
4441                    isc_task_t *task,
4442                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4443                    isc_socketevent_t *event, unsigned int flags)
4444 {
4445         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
4446         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
4447                 REQUIRE(sock->type == isc_sockettype_udp);
4448         event->ev_sender = sock;
4449         event->result = ISC_R_UNEXPECTED;
4450         ISC_LIST_INIT(event->bufferlist);
4451         event->region = *region;
4452         event->n = 0;
4453         event->offset = 0;
4454         event->attributes = 0;
4455
4456         return (socket_send(sock, event, task, address, pktinfo, flags));
4457 }
4458
4459 void
4460 isc_socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
4461 #ifdef ISC_PLATFORM_HAVESYSUNH
4462         int s;
4463         struct stat sb;
4464         char strbuf[ISC_STRERRORSIZE];
4465
4466         if (sockaddr->type.sa.sa_family != AF_UNIX)
4467                 return;
4468
4469 #ifndef S_ISSOCK
4470 #if defined(S_IFMT) && defined(S_IFSOCK)
4471 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
4472 #elif defined(_S_IFMT) && defined(S_IFSOCK)
4473 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
4474 #endif
4475 #endif
4476
4477 #ifndef S_ISFIFO
4478 #if defined(S_IFMT) && defined(S_IFIFO)
4479 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
4480 #elif defined(_S_IFMT) && defined(S_IFIFO)
4481 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
4482 #endif
4483 #endif
4484
4485 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
4486 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
4487 #endif
4488
4489 #ifndef S_ISFIFO
4490 #define S_ISFIFO(mode) 0
4491 #endif
4492
4493 #ifndef S_ISSOCK
4494 #define S_ISSOCK(mode) 0
4495 #endif
4496
4497         if (active) {
4498                 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4499                         isc__strerror(errno, strbuf, sizeof(strbuf));
4500                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4501                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4502                                       "isc_socket_cleanunix: stat(%s): %s",
4503                                       sockaddr->type.sunix.sun_path, strbuf);
4504                         return;
4505                 }
4506                 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4507                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4508                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4509                                       "isc_socket_cleanunix: %s: not a socket",
4510                                       sockaddr->type.sunix.sun_path);
4511                         return;
4512                 }
4513                 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4514                         isc__strerror(errno, strbuf, sizeof(strbuf));
4515                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4516                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4517                                       "isc_socket_cleanunix: unlink(%s): %s",
4518                                       sockaddr->type.sunix.sun_path, strbuf);
4519                 }
4520                 return;
4521         }
4522
4523         s = socket(AF_UNIX, SOCK_STREAM, 0);
4524         if (s < 0) {
4525                 isc__strerror(errno, strbuf, sizeof(strbuf));
4526                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4527                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4528                               "isc_socket_cleanunix: socket(%s): %s",
4529                               sockaddr->type.sunix.sun_path, strbuf);
4530                 return;
4531         }
4532
4533         if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4534                 switch (errno) {
4535                 case ENOENT:    /* We exited cleanly last time */
4536                         break;
4537                 default:
4538                         isc__strerror(errno, strbuf, sizeof(strbuf));
4539                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4540                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4541                                       "isc_socket_cleanunix: stat(%s): %s",
4542                                       sockaddr->type.sunix.sun_path, strbuf);
4543                         break;
4544                 }
4545                 goto cleanup;
4546         }
4547
4548         if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4549                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4550                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4551                               "isc_socket_cleanunix: %s: not a socket",
4552                               sockaddr->type.sunix.sun_path);
4553                 goto cleanup;
4554         }
4555
4556         if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
4557                     sizeof(sockaddr->type.sunix)) < 0) {
4558                 switch (errno) {
4559                 case ECONNREFUSED:
4560                 case ECONNRESET:
4561                         if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4562                                 isc__strerror(errno, strbuf, sizeof(strbuf));
4563                                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4564                                               ISC_LOGMODULE_SOCKET,
4565                                               ISC_LOG_WARNING,
4566                                               "isc_socket_cleanunix: "
4567                                               "unlink(%s): %s",
4568                                               sockaddr->type.sunix.sun_path,
4569                                               strbuf);
4570                         }
4571                         break;
4572                 default:
4573                         isc__strerror(errno, strbuf, sizeof(strbuf));
4574                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4575                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4576                                       "isc_socket_cleanunix: connect(%s): %s",
4577                                       sockaddr->type.sunix.sun_path, strbuf);
4578                         break;
4579                 }
4580         }
4581  cleanup:
4582         close(s);
4583 #else
4584         UNUSED(sockaddr);
4585         UNUSED(active);
4586 #endif
4587 }
4588
4589 isc_result_t
4590 isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
4591                     isc_uint32_t owner, isc_uint32_t group)
4592 {
4593 #ifdef ISC_PLATFORM_HAVESYSUNH
4594         isc_result_t result = ISC_R_SUCCESS;
4595         char strbuf[ISC_STRERRORSIZE];
4596         char path[sizeof(sockaddr->type.sunix.sun_path)];
4597 #ifdef NEED_SECURE_DIRECTORY
4598         char *slash;
4599 #endif
4600
4601         REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4602         INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4603         strcpy(path, sockaddr->type.sunix.sun_path);
4604
4605 #ifdef NEED_SECURE_DIRECTORY
4606         slash = strrchr(path, '/');
4607         if (slash != NULL) {
4608                 if (slash != path)
4609                         *slash = '\0';
4610                 else
4611                         strcpy(path, "/");
4612         } else
4613                 strcpy(path, ".");
4614 #endif
4615
4616         if (chmod(path, perm) < 0) {
4617                 isc__strerror(errno, strbuf, sizeof(strbuf));
4618                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4619                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4620                               "isc_socket_permunix: chmod(%s, %d): %s",
4621                               path, perm, strbuf);
4622                 result = ISC_R_FAILURE;
4623         }
4624         if (chown(path, owner, group) < 0) {
4625                 isc__strerror(errno, strbuf, sizeof(strbuf));
4626                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4627                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4628                               "isc_socket_permunix: chown(%s, %d, %d): %s",
4629                               path, owner, group,
4630                               strbuf);
4631                 result = ISC_R_FAILURE;
4632         }
4633         return (result);
4634 #else
4635         UNUSED(sockaddr);
4636         UNUSED(perm);
4637         UNUSED(owner);
4638         UNUSED(group);
4639         return (ISC_R_NOTIMPLEMENTED);
4640 #endif
4641 }
4642
4643 isc_result_t
4644 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
4645                 unsigned int options) {
4646         char strbuf[ISC_STRERRORSIZE];
4647         int on = 1;
4648
4649         LOCK(&sock->lock);
4650
4651         INSIST(!sock->bound);
4652
4653         if (sock->pf != sockaddr->type.sa.sa_family) {
4654                 UNLOCK(&sock->lock);
4655                 return (ISC_R_FAMILYMISMATCH);
4656         }
4657         /*
4658          * Only set SO_REUSEADDR when we want a specific port.
4659          */
4660 #ifdef AF_UNIX
4661         if (sock->pf == AF_UNIX)
4662                 goto bind_socket;
4663 #endif
4664         if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
4665             isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
4666             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
4667                        sizeof(on)) < 0) {
4668                 UNEXPECTED_ERROR(__FILE__, __LINE__,
4669                                  "setsockopt(%d) %s", sock->fd,
4670                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4671                                                 ISC_MSG_FAILED, "failed"));
4672                 /* Press on... */
4673         }
4674 #ifdef AF_UNIX
4675  bind_socket:
4676 #endif
4677         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
4678                 inc_stats(sock->manager->stats,
4679                           sock->statsindex[STATID_BINDFAIL]);
4680
4681                 UNLOCK(&sock->lock);
4682                 switch (errno) {
4683                 case EACCES:
4684                         return (ISC_R_NOPERM);
4685                 case EADDRNOTAVAIL:
4686                         return (ISC_R_ADDRNOTAVAIL);
4687                 case EADDRINUSE:
4688                         return (ISC_R_ADDRINUSE);
4689                 case EINVAL:
4690                         return (ISC_R_BOUND);
4691                 default:
4692                         isc__strerror(errno, strbuf, sizeof(strbuf));
4693                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
4694                                          strbuf);
4695                         return (ISC_R_UNEXPECTED);
4696                 }
4697         }
4698
4699         socket_log(sock, sockaddr, TRACE,
4700                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
4701         sock->bound = 1;
4702
4703         UNLOCK(&sock->lock);
4704         return (ISC_R_SUCCESS);
4705 }
4706
4707 isc_result_t
4708 isc_socket_filter(isc_socket_t *sock, const char *filter) {
4709 #ifdef SO_ACCEPTFILTER
4710         char strbuf[ISC_STRERRORSIZE];
4711         struct accept_filter_arg afa;
4712 #else
4713         UNUSED(sock);
4714         UNUSED(filter);
4715 #endif
4716
4717         REQUIRE(VALID_SOCKET(sock));
4718
4719 #ifdef SO_ACCEPTFILTER
4720         bzero(&afa, sizeof(afa));
4721         strncpy(afa.af_name, filter, sizeof(afa.af_name));
4722         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
4723                          &afa, sizeof(afa)) == -1) {
4724                 isc__strerror(errno, strbuf, sizeof(strbuf));
4725                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
4726                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
4727                            strbuf);
4728                 return (ISC_R_FAILURE);
4729         }
4730         return (ISC_R_SUCCESS);
4731 #else
4732         return (ISC_R_NOTIMPLEMENTED);
4733 #endif
4734 }
4735
4736 /*
4737  * Set up to listen on a given socket.  We do this by creating an internal
4738  * event that will be dispatched when the socket has read activity.  The
4739  * watcher will send the internal event to the task when there is a new
4740  * connection.
4741  *
4742  * Unlike in read, we don't preallocate a done event here.  Every time there
4743  * is a new connection we'll have to allocate a new one anyway, so we might
4744  * as well keep things simple rather than having to track them.
4745  */
4746 isc_result_t
4747 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4748         char strbuf[ISC_STRERRORSIZE];
4749
4750         REQUIRE(VALID_SOCKET(sock));
4751
4752         LOCK(&sock->lock);
4753
4754         REQUIRE(!sock->listener);
4755         REQUIRE(sock->bound);
4756         REQUIRE(sock->type == isc_sockettype_tcp ||
4757                 sock->type == isc_sockettype_unix);
4758
4759         if (backlog == 0)
4760                 backlog = SOMAXCONN;
4761
4762         if (listen(sock->fd, (int)backlog) < 0) {
4763                 UNLOCK(&sock->lock);
4764                 isc__strerror(errno, strbuf, sizeof(strbuf));
4765
4766                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4767
4768                 return (ISC_R_UNEXPECTED);
4769         }
4770
4771         sock->listener = 1;
4772
4773         UNLOCK(&sock->lock);
4774         return (ISC_R_SUCCESS);
4775 }
4776
4777 /*
4778  * This should try to do aggressive accept() XXXMLG
4779  */
4780 isc_result_t
4781 isc_socket_accept(isc_socket_t *sock,
4782                   isc_task_t *task, isc_taskaction_t action, const void *arg)
4783 {
4784         isc_socket_newconnev_t *dev;
4785         isc_socketmgr_t *manager;
4786         isc_task_t *ntask = NULL;
4787         isc_socket_t *nsock;
4788         isc_result_t result;
4789         isc_boolean_t do_poke = ISC_FALSE;
4790
4791         REQUIRE(VALID_SOCKET(sock));
4792         manager = sock->manager;
4793         REQUIRE(VALID_MANAGER(manager));
4794
4795         LOCK(&sock->lock);
4796
4797         REQUIRE(sock->listener);
4798
4799         /*
4800          * Sender field is overloaded here with the task we will be sending
4801          * this event to.  Just before the actual event is delivered the
4802          * actual ev_sender will be touched up to be the socket.
4803          */
4804         dev = (isc_socket_newconnev_t *)
4805                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
4806                                    action, arg, sizeof(*dev));
4807         if (dev == NULL) {
4808                 UNLOCK(&sock->lock);
4809                 return (ISC_R_NOMEMORY);
4810         }
4811         ISC_LINK_INIT(dev, ev_link);
4812
4813         result = allocate_socket(manager, sock->type, &nsock);
4814         if (result != ISC_R_SUCCESS) {
4815                 isc_event_free(ISC_EVENT_PTR(&dev));
4816                 UNLOCK(&sock->lock);
4817                 return (result);
4818         }
4819
4820         /*
4821          * Attach to socket and to task.
4822          */
4823         isc_task_attach(task, &ntask);
4824         nsock->references++;
4825         nsock->statsindex = sock->statsindex;
4826
4827         dev->ev_sender = ntask;
4828         dev->newsocket = nsock;
4829
4830         /*
4831          * Poke watcher here.  We still have the socket locked, so there
4832          * is no race condition.  We will keep the lock for such a short
4833          * bit of time waking it up now or later won't matter all that much.
4834          */
4835         if (ISC_LIST_EMPTY(sock->accept_list))
4836                 do_poke = ISC_TRUE;
4837
4838         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
4839
4840         if (do_poke)
4841                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
4842
4843         UNLOCK(&sock->lock);
4844         return (ISC_R_SUCCESS);
4845 }
4846
4847 isc_result_t
4848 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
4849                    isc_task_t *task, isc_taskaction_t action, const void *arg)
4850 {
4851         isc_socket_connev_t *dev;
4852         isc_task_t *ntask = NULL;
4853         isc_socketmgr_t *manager;
4854         int cc;
4855         char strbuf[ISC_STRERRORSIZE];
4856         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
4857
4858         REQUIRE(VALID_SOCKET(sock));
4859         REQUIRE(addr != NULL);
4860         REQUIRE(task != NULL);
4861         REQUIRE(action != NULL);
4862
4863         manager = sock->manager;
4864         REQUIRE(VALID_MANAGER(manager));
4865         REQUIRE(addr != NULL);
4866
4867         if (isc_sockaddr_ismulticast(addr))
4868                 return (ISC_R_MULTICAST);
4869
4870         LOCK(&sock->lock);
4871
4872         REQUIRE(!sock->connecting);
4873
4874         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
4875                                                         ISC_SOCKEVENT_CONNECT,
4876                                                         action, arg,
4877                                                         sizeof(*dev));
4878         if (dev == NULL) {
4879                 UNLOCK(&sock->lock);
4880                 return (ISC_R_NOMEMORY);
4881         }
4882         ISC_LINK_INIT(dev, ev_link);
4883
4884         /*
4885          * Try to do the connect right away, as there can be only one
4886          * outstanding, and it might happen to complete.
4887          */
4888         sock->peer_address = *addr;
4889         cc = connect(sock->fd, &addr->type.sa, addr->length);
4890         if (cc < 0) {
4891                 /*
4892                  * HP-UX "fails" to connect a UDP socket and sets errno to
4893                  * EINPROGRESS if it's non-blocking.  We'd rather regard this as
4894                  * a success and let the user detect it if it's really an error
4895                  * at the time of sending a packet on the socket.
4896                  */
4897                 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
4898                         cc = 0;
4899                         goto success;
4900                 }
4901                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
4902                         goto queue;
4903
4904                 switch (errno) {
4905 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
4906                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
4907                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4908                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4909                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4910                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4911 #ifdef EHOSTDOWN
4912                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4913 #endif
4914                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4915                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4916                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4917                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4918                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4919 #undef ERROR_MATCH
4920                 }
4921
4922                 sock->connected = 0;
4923
4924                 isc__strerror(errno, strbuf, sizeof(strbuf));
4925                 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
4926                 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
4927                                  addrbuf, errno, strbuf);
4928
4929                 UNLOCK(&sock->lock);
4930                 inc_stats(sock->manager->stats,
4931                           sock->statsindex[STATID_CONNECTFAIL]);
4932                 isc_event_free(ISC_EVENT_PTR(&dev));
4933                 return (ISC_R_UNEXPECTED);
4934
4935         err_exit:
4936                 sock->connected = 0;
4937                 isc_task_send(task, ISC_EVENT_PTR(&dev));
4938
4939                 UNLOCK(&sock->lock);
4940                 inc_stats(sock->manager->stats,
4941                           sock->statsindex[STATID_CONNECTFAIL]);
4942                 return (ISC_R_SUCCESS);
4943         }
4944
4945         /*
4946          * If connect completed, fire off the done event.
4947          */
4948  success:
4949         if (cc == 0) {
4950                 sock->connected = 1;
4951                 sock->bound = 1;
4952                 dev->result = ISC_R_SUCCESS;
4953                 isc_task_send(task, ISC_EVENT_PTR(&dev));
4954
4955                 UNLOCK(&sock->lock);
4956
4957                 inc_stats(sock->manager->stats,
4958                           sock->statsindex[STATID_CONNECT]);
4959
4960                 return (ISC_R_SUCCESS);
4961         }
4962
4963  queue:
4964
4965         /*
4966          * Attach to task.
4967          */
4968         isc_task_attach(task, &ntask);
4969
4970         sock->connecting = 1;
4971
4972         dev->ev_sender = ntask;
4973
4974         /*
4975          * Poke watcher here.  We still have the socket locked, so there
4976          * is no race condition.  We will keep the lock for such a short
4977          * bit of time waking it up now or later won't matter all that much.
4978          */
4979         if (sock->connect_ev == NULL)
4980                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
4981
4982         sock->connect_ev = dev;
4983
4984         UNLOCK(&sock->lock);
4985         return (ISC_R_SUCCESS);
4986 }
4987
4988 /*
4989  * Called when a socket with a pending connect() finishes.
4990  */
4991 static void
4992 internal_connect(isc_task_t *me, isc_event_t *ev) {
4993         isc_socket_t *sock;
4994         isc_socket_connev_t *dev;
4995         isc_task_t *task;
4996         int cc;
4997         ISC_SOCKADDR_LEN_T optlen;
4998         char strbuf[ISC_STRERRORSIZE];
4999         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5000
5001         UNUSED(me);
5002         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
5003
5004         sock = ev->ev_sender;
5005         INSIST(VALID_SOCKET(sock));
5006
5007         LOCK(&sock->lock);
5008
5009         /*
5010          * When the internal event was sent the reference count was bumped
5011          * to keep the socket around for us.  Decrement the count here.
5012          */
5013         INSIST(sock->references > 0);
5014         sock->references--;
5015         if (sock->references == 0) {
5016                 UNLOCK(&sock->lock);
5017                 destroy(&sock);
5018                 return;
5019         }
5020
5021         /*
5022          * Has this event been canceled?
5023          */
5024         dev = sock->connect_ev;
5025         if (dev == NULL) {
5026                 INSIST(!sock->connecting);
5027                 UNLOCK(&sock->lock);
5028                 return;
5029         }
5030
5031         INSIST(sock->connecting);
5032         sock->connecting = 0;
5033
5034         /*
5035          * Get any possible error status here.
5036          */
5037         optlen = sizeof(cc);
5038         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
5039                        (void *)&cc, (void *)&optlen) < 0)
5040                 cc = errno;
5041         else
5042                 errno = cc;
5043
5044         if (errno != 0) {
5045                 /*
5046                  * If the error is EAGAIN, just re-select on this
5047                  * fd and pretend nothing strange happened.
5048                  */
5049                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
5050                         sock->connecting = 1;
5051                         select_poke(sock->manager, sock->fd,
5052                                     SELECT_POKE_CONNECT);
5053                         UNLOCK(&sock->lock);
5054
5055                         return;
5056                 }
5057
5058                 inc_stats(sock->manager->stats,
5059                           sock->statsindex[STATID_CONNECTFAIL]);
5060
5061                 /*
5062                  * Translate other errors into ISC_R_* flavors.
5063                  */
5064                 switch (errno) {
5065 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
5066                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
5067                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5068                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5069                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5070                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5071 #ifdef EHOSTDOWN
5072                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5073 #endif
5074                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5075                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5076                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5077                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5078                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
5079                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5080 #undef ERROR_MATCH
5081                 default:
5082                         dev->result = ISC_R_UNEXPECTED;
5083                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5084                                             sizeof(peerbuf));
5085                         isc__strerror(errno, strbuf, sizeof(strbuf));
5086                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5087                                          "internal_connect: connect(%s) %s",
5088                                          peerbuf, strbuf);
5089                 }
5090         } else {
5091                 inc_stats(sock->manager->stats,
5092                           sock->statsindex[STATID_CONNECT]);
5093                 dev->result = ISC_R_SUCCESS;
5094                 sock->connected = 1;
5095                 sock->bound = 1;
5096         }
5097
5098         sock->connect_ev = NULL;
5099
5100         UNLOCK(&sock->lock);
5101
5102         task = dev->ev_sender;
5103         dev->ev_sender = sock;
5104         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
5105 }
5106
5107 isc_result_t
5108 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5109         isc_result_t result;
5110
5111         REQUIRE(VALID_SOCKET(sock));
5112         REQUIRE(addressp != NULL);
5113
5114         LOCK(&sock->lock);
5115
5116         if (sock->connected) {
5117                 *addressp = sock->peer_address;
5118                 result = ISC_R_SUCCESS;
5119         } else {
5120                 result = ISC_R_NOTCONNECTED;
5121         }
5122
5123         UNLOCK(&sock->lock);
5124
5125         return (result);
5126 }
5127
5128 isc_result_t
5129 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5130         ISC_SOCKADDR_LEN_T len;
5131         isc_result_t result;
5132         char strbuf[ISC_STRERRORSIZE];
5133
5134         REQUIRE(VALID_SOCKET(sock));
5135         REQUIRE(addressp != NULL);
5136
5137         LOCK(&sock->lock);
5138
5139         if (!sock->bound) {
5140                 result = ISC_R_NOTBOUND;
5141                 goto out;
5142         }
5143
5144         result = ISC_R_SUCCESS;
5145
5146         len = sizeof(addressp->type);
5147         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5148                 isc__strerror(errno, strbuf, sizeof(strbuf));
5149                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
5150                                  strbuf);
5151                 result = ISC_R_UNEXPECTED;
5152                 goto out;
5153         }
5154         addressp->length = (unsigned int)len;
5155
5156  out:
5157         UNLOCK(&sock->lock);
5158
5159         return (result);
5160 }
5161
5162 /*
5163  * Run through the list of events on this socket, and cancel the ones
5164  * queued for task "task" of type "how".  "how" is a bitmask.
5165  */
5166 void
5167 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
5168
5169         REQUIRE(VALID_SOCKET(sock));
5170
5171         /*
5172          * Quick exit if there is nothing to do.  Don't even bother locking
5173          * in this case.
5174          */
5175         if (how == 0)
5176                 return;
5177
5178         LOCK(&sock->lock);
5179
5180         /*
5181          * All of these do the same thing, more or less.
5182          * Each will:
5183          *      o If the internal event is marked as "posted" try to
5184          *        remove it from the task's queue.  If this fails, mark it
5185          *        as canceled instead, and let the task clean it up later.
5186          *      o For each I/O request for that task of that type, post
5187          *        its done event with status of "ISC_R_CANCELED".
5188          *      o Reset any state needed.
5189          */
5190         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
5191             && !ISC_LIST_EMPTY(sock->recv_list)) {
5192                 isc_socketevent_t      *dev;
5193                 isc_socketevent_t      *next;
5194                 isc_task_t             *current_task;
5195
5196                 dev = ISC_LIST_HEAD(sock->recv_list);
5197
5198                 while (dev != NULL) {
5199                         current_task = dev->ev_sender;
5200                         next = ISC_LIST_NEXT(dev, ev_link);
5201
5202                         if ((task == NULL) || (task == current_task)) {
5203                                 dev->result = ISC_R_CANCELED;
5204                                 send_recvdone_event(sock, &dev);
5205                         }
5206                         dev = next;
5207                 }
5208         }
5209
5210         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
5211             && !ISC_LIST_EMPTY(sock->send_list)) {
5212                 isc_socketevent_t      *dev;
5213                 isc_socketevent_t      *next;
5214                 isc_task_t             *current_task;
5215
5216                 dev = ISC_LIST_HEAD(sock->send_list);
5217
5218                 while (dev != NULL) {
5219                         current_task = dev->ev_sender;
5220                         next = ISC_LIST_NEXT(dev, ev_link);
5221
5222                         if ((task == NULL) || (task == current_task)) {
5223                                 dev->result = ISC_R_CANCELED;
5224                                 send_senddone_event(sock, &dev);
5225                         }
5226                         dev = next;
5227                 }
5228         }
5229
5230         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
5231             && !ISC_LIST_EMPTY(sock->accept_list)) {
5232                 isc_socket_newconnev_t *dev;
5233                 isc_socket_newconnev_t *next;
5234                 isc_task_t             *current_task;
5235
5236                 dev = ISC_LIST_HEAD(sock->accept_list);
5237                 while (dev != NULL) {
5238                         current_task = dev->ev_sender;
5239                         next = ISC_LIST_NEXT(dev, ev_link);
5240
5241                         if ((task == NULL) || (task == current_task)) {
5242
5243                                 ISC_LIST_UNLINK(sock->accept_list, dev,
5244                                                 ev_link);
5245
5246                                 dev->newsocket->references--;
5247                                 free_socket(&dev->newsocket);
5248
5249                                 dev->result = ISC_R_CANCELED;
5250                                 dev->ev_sender = sock;
5251                                 isc_task_sendanddetach(&current_task,
5252                                                        ISC_EVENT_PTR(&dev));
5253                         }
5254
5255                         dev = next;
5256                 }
5257         }
5258
5259         /*
5260          * Connecting is not a list.
5261          */
5262         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
5263             && sock->connect_ev != NULL) {
5264                 isc_socket_connev_t    *dev;
5265                 isc_task_t             *current_task;
5266
5267                 INSIST(sock->connecting);
5268                 sock->connecting = 0;
5269
5270                 dev = sock->connect_ev;
5271                 current_task = dev->ev_sender;
5272
5273                 if ((task == NULL) || (task == current_task)) {
5274                         sock->connect_ev = NULL;
5275
5276                         dev->result = ISC_R_CANCELED;
5277                         dev->ev_sender = sock;
5278                         isc_task_sendanddetach(&current_task,
5279                                                ISC_EVENT_PTR(&dev));
5280                 }
5281         }
5282
5283         UNLOCK(&sock->lock);
5284 }
5285
5286 isc_sockettype_t
5287 isc_socket_gettype(isc_socket_t *sock) {
5288         REQUIRE(VALID_SOCKET(sock));
5289
5290         return (sock->type);
5291 }
5292
5293 isc_boolean_t
5294 isc_socket_isbound(isc_socket_t *sock) {
5295         isc_boolean_t val;
5296
5297         LOCK(&sock->lock);
5298         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
5299         UNLOCK(&sock->lock);
5300
5301         return (val);
5302 }
5303
5304 void
5305 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
5306 #if defined(IPV6_V6ONLY)
5307         int onoff = yes ? 1 : 0;
5308 #else
5309         UNUSED(yes);
5310         UNUSED(sock);
5311 #endif
5312
5313         REQUIRE(VALID_SOCKET(sock));
5314
5315 #ifdef IPV6_V6ONLY
5316         if (sock->pf == AF_INET6) {
5317                 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
5318                                (void *)&onoff, sizeof(int)) < 0) {
5319                         char strbuf[ISC_STRERRORSIZE];
5320
5321                         UNEXPECTED_ERROR(__FILE__, __LINE__,
5322                                          "setsockopt(%d, IPV6_V6ONLY) "
5323                                          "%s: %s", sock->fd,
5324                                          isc_msgcat_get(isc_msgcat,
5325                                                         ISC_MSGSET_GENERAL,
5326                                                         ISC_MSG_FAILED,
5327                                                         "failed"),
5328                                          strbuf);
5329                 }
5330         }
5331         FIX_IPV6_RECVPKTINFO(sock);     /* AIX */
5332 #endif
5333 }
5334
5335 #ifndef ISC_PLATFORM_USETHREADS
5336 /* In our assumed scenario, we can simply use a single static object. */
5337 static isc_socketwait_t swait_private;
5338
5339 int
5340 isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) {
5341         int n;
5342 #ifdef USE_KQUEUE
5343         struct timespec ts, *tsp;
5344 #endif
5345 #ifdef USE_EPOLL
5346         int timeout;
5347 #endif
5348 #ifdef USE_DEVPOLL
5349         struct dvpoll dvp;
5350 #endif
5351
5352         REQUIRE(swaitp != NULL && *swaitp == NULL);
5353
5354         if (socketmgr == NULL)
5355                 return (0);
5356
5357 #ifdef USE_KQUEUE
5358         if (tvp != NULL) {
5359                 ts.tv_sec = tvp->tv_sec;
5360                 ts.tv_nsec = tvp->tv_usec * 1000;
5361                 tsp = &ts;
5362         } else
5363                 tsp = NULL;
5364         swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0,
5365                                        socketmgr->events, socketmgr->nevents,
5366                                        tsp);
5367         n = swait_private.nevents;
5368 #elif defined(USE_EPOLL)
5369         if (tvp != NULL)
5370                 timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
5371         else
5372                 timeout = -1;
5373         swait_private.nevents = epoll_wait(socketmgr->epoll_fd,
5374                                            socketmgr->events,
5375                                            socketmgr->nevents, timeout);
5376         n = swait_private.nevents;
5377 #elif defined(USE_DEVPOLL)
5378         dvp.dp_fds = socketmgr->events;
5379         dvp.dp_nfds = socketmgr->nevents;
5380         if (tvp != NULL) {
5381                 dvp.dp_timeout = tvp->tv_sec * 1000 +
5382                         (tvp->tv_usec + 999) / 1000;
5383         } else
5384                 dvp.dp_timeout = -1;
5385         swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp);
5386         n = swait_private.nevents;
5387 #elif defined(USE_SELECT)
5388         memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
5389                socketmgr->fd_bufsize);
5390         memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
5391                socketmgr->fd_bufsize);
5392
5393         swait_private.readset = socketmgr->read_fds_copy;
5394         swait_private.writeset = socketmgr->write_fds_copy;
5395         swait_private.maxfd = socketmgr->maxfd + 1;
5396
5397         n = select(swait_private.maxfd, swait_private.readset,
5398                    swait_private.writeset, NULL, tvp);
5399 #endif
5400
5401         *swaitp = &swait_private;
5402         return (n);
5403 }
5404
5405 isc_result_t
5406 isc__socketmgr_dispatch(isc_socketwait_t *swait) {
5407         REQUIRE(swait == &swait_private);
5408
5409         if (socketmgr == NULL)
5410                 return (ISC_R_NOTFOUND);
5411
5412 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
5413         (void)process_fds(socketmgr, socketmgr->events, swait->nevents);
5414         return (ISC_R_SUCCESS);
5415 #elif defined(USE_SELECT)
5416         process_fds(socketmgr, swait->maxfd, swait->readset, swait->writeset);
5417         return (ISC_R_SUCCESS);
5418 #endif
5419 }
5420 #endif /* ISC_PLATFORM_USETHREADS */
5421
5422 void
5423 isc_socket_setname(isc_socket_t *socket, const char *name, void *tag) {
5424
5425         /*
5426          * Name 'socket'.
5427          */
5428
5429         REQUIRE(VALID_SOCKET(socket));
5430
5431         LOCK(&socket->lock);
5432         memset(socket->name, 0, sizeof(socket->name));
5433         strncpy(socket->name, name, sizeof(socket->name) - 1);
5434         socket->tag = tag;
5435         UNLOCK(&socket->lock);
5436 }
5437
5438 const char *
5439 isc_socket_getname(isc_socket_t *socket) {
5440         return (socket->name);
5441 }
5442
5443 void *
5444 isc_socket_gettag(isc_socket_t *socket) {
5445         return (socket->tag);
5446 }
5447
5448 #ifdef HAVE_LIBXML2
5449
5450 static const char *
5451 _socktype(isc_sockettype_t type)
5452 {
5453         if (type == isc_sockettype_udp)
5454                 return ("udp");
5455         else if (type == isc_sockettype_tcp)
5456                 return ("tcp");
5457         else if (type == isc_sockettype_unix)
5458                 return ("unix");
5459         else if (type == isc_sockettype_fdwatch)
5460                 return ("fdwatch");
5461         else
5462                 return ("not-initialized");
5463 }
5464
5465 void
5466 isc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
5467 {
5468         isc_socket_t *sock;
5469         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5470         isc_sockaddr_t addr;
5471         ISC_SOCKADDR_LEN_T len;
5472
5473         LOCK(&mgr->lock);
5474
5475 #ifndef ISC_PLATFORM_USETHREADS
5476         xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5477         xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
5478         xmlTextWriterEndElement(writer);
5479 #endif
5480
5481         xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
5482         sock = ISC_LIST_HEAD(mgr->socklist);
5483         while (sock != NULL) {
5484                 LOCK(&sock->lock);
5485                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
5486
5487                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
5488                 xmlTextWriterWriteFormatString(writer, "%p", sock);
5489                 xmlTextWriterEndElement(writer);
5490
5491                 if (sock->name[0] != 0) {
5492                         xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
5493                         xmlTextWriterWriteFormatString(writer, "%s",
5494                                                        sock->name);
5495                         xmlTextWriterEndElement(writer); /* name */
5496                 }
5497
5498                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5499                 xmlTextWriterWriteFormatString(writer, "%d", sock->references);
5500                 xmlTextWriterEndElement(writer);
5501
5502                 xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
5503                                           ISC_XMLCHAR _socktype(sock->type));
5504
5505                 if (sock->connected) {
5506                         isc_sockaddr_format(&sock->peer_address, peerbuf,
5507                                             sizeof(peerbuf));
5508                         xmlTextWriterWriteElement(writer,
5509                                                   ISC_XMLCHAR "peer-address",
5510                                                   ISC_XMLCHAR peerbuf);
5511                 }
5512
5513                 len = sizeof(addr);
5514                 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5515                         isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5516                         xmlTextWriterWriteElement(writer,
5517                                                   ISC_XMLCHAR "local-address",
5518                                                   ISC_XMLCHAR peerbuf);
5519                 }
5520
5521                 xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
5522                 if (sock->pending_recv)
5523                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5524                                                 ISC_XMLCHAR "pending-receive");
5525                 if (sock->pending_send)
5526                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5527                                                   ISC_XMLCHAR "pending-send");
5528                 if (sock->pending_accept)
5529                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5530                                                  ISC_XMLCHAR "pending_accept");
5531                 if (sock->listener)
5532                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5533                                                   ISC_XMLCHAR "listener");
5534                 if (sock->connected)
5535                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5536                                                   ISC_XMLCHAR "connected");
5537                 if (sock->connecting)
5538                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5539                                                   ISC_XMLCHAR "connecting");
5540                 if (sock->bound)
5541                         xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5542                                                   ISC_XMLCHAR "bound");
5543
5544                 xmlTextWriterEndElement(writer); /* states */
5545
5546                 xmlTextWriterEndElement(writer); /* socket */
5547
5548                 UNLOCK(&sock->lock);
5549                 sock = ISC_LIST_NEXT(sock, link);
5550         }
5551         xmlTextWriterEndElement(writer); /* sockets */
5552
5553         UNLOCK(&mgr->lock);
5554 }
5555 #endif /* HAVE_LIBXML2 */