Merge branch 'vendor/GCC44'
[dragonfly.git] / contrib / bind-9.3 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004-2006  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: socket.c,v 1.207.2.19.2.26 2006/05/19 02:53:36 marka Exp $ */
19
20 #include <config.h>
21
22 #include <sys/param.h>
23 #include <sys/types.h>
24 #include <sys/socket.h>
25 #include <sys/time.h>
26 #include <sys/uio.h>
27
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <stddef.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <unistd.h>
34
35 #include <isc/buffer.h>
36 #include <isc/bufferlist.h>
37 #include <isc/condition.h>
38 #include <isc/formatcheck.h>
39 #include <isc/list.h>
40 #include <isc/log.h>
41 #include <isc/mem.h>
42 #include <isc/msgs.h>
43 #include <isc/mutex.h>
44 #include <isc/net.h>
45 #include <isc/platform.h>
46 #include <isc/print.h>
47 #include <isc/region.h>
48 #include <isc/socket.h>
49 #include <isc/strerror.h>
50 #include <isc/task.h>
51 #include <isc/thread.h>
52 #include <isc/util.h>
53
54 #include "errno2result.h"
55
56 #ifndef ISC_PLATFORM_USETHREADS
57 #include "socket_p.h"
58 #endif /* ISC_PLATFORM_USETHREADS */
59
60 /*
61  * Some systems define the socket length argument as an int, some as size_t,
62  * some as socklen_t.  This is here so it can be easily changed if needed.
63  */
64 #ifndef ISC_SOCKADDR_LEN_T
65 #define ISC_SOCKADDR_LEN_T unsigned int
66 #endif
67
68 /*
69  * Define what the possible "soft" errors can be.  These are non-fatal returns
70  * of various network related functions, like recv() and so on.
71  *
72  * For some reason, BSDI (and perhaps others) will sometimes return <0
73  * from recv() but will have errno==0.  This is broken, but we have to
74  * work around it here.
75  */
76 #define SOFT_ERROR(e)   ((e) == EAGAIN || \
77                          (e) == EWOULDBLOCK || \
78                          (e) == EINTR || \
79                          (e) == 0)
80
81 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
82
83 /*
84  * DLVL(90)  --  Function entry/exit and other tracing.
85  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
86  * DLVL(60)  --  Socket data send/receive
87  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
88  * DLVL(20)  --  Socket creation/destruction.
89  */
90 #define TRACE_LEVEL             90
91 #define CORRECTNESS_LEVEL       70
92 #define IOEVENT_LEVEL           60
93 #define EVENT_LEVEL             50
94 #define CREATION_LEVEL          20
95
96 #define TRACE           DLVL(TRACE_LEVEL)
97 #define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
98 #define IOEVENT         DLVL(IOEVENT_LEVEL)
99 #define EVENT           DLVL(EVENT_LEVEL)
100 #define CREATION        DLVL(CREATION_LEVEL)
101
102 typedef isc_event_t intev_t;
103
104 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
105 #define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
106
107 /*
108  * IPv6 control information.  If the socket is an IPv6 socket we want
109  * to collect the destination address and interface so the client can
110  * set them on outgoing packets.
111  */
112 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
113 #ifndef USE_CMSG
114 #define USE_CMSG        1
115 #endif
116 #endif
117
118 /*
119  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
120  * a setsockopt() like interface to request timestamps, and if the OS
121  * doesn't do it for us, call gettimeofday() on every UDP receive?
122  */
123 #ifdef SO_TIMESTAMP
124 #ifndef USE_CMSG
125 #define USE_CMSG        1
126 #endif
127 #endif
128
129 /*
130  * The number of times a send operation is repeated if the result is EINTR.
131  */
132 #define NRETRIES 10
133
134 struct isc_socket {
135         /* Not locked. */
136         unsigned int            magic;
137         isc_socketmgr_t        *manager;
138         isc_mutex_t             lock;
139         isc_sockettype_t        type;
140
141         /* Locked by socket lock. */
142         ISC_LINK(isc_socket_t)  link;
143         unsigned int            references;
144         int                     fd;
145         int                     pf;
146
147         ISC_LIST(isc_socketevent_t)             send_list;
148         ISC_LIST(isc_socketevent_t)             recv_list;
149         ISC_LIST(isc_socket_newconnev_t)        accept_list;
150         isc_socket_connev_t                    *connect_ev;
151
152         /*
153          * Internal events.  Posted when a descriptor is readable or
154          * writable.  These are statically allocated and never freed.
155          * They will be set to non-purgable before use.
156          */
157         intev_t                 readable_ev;
158         intev_t                 writable_ev;
159
160         isc_sockaddr_t          address;  /* remote address */
161
162         unsigned int            pending_recv : 1,
163                                 pending_send : 1,
164                                 pending_accept : 1,
165                                 listener : 1, /* listener socket */
166                                 connected : 1,
167                                 connecting : 1, /* connect pending */
168                                 bound : 1; /* bound to local addr */
169
170 #ifdef ISC_NET_RECVOVERFLOW
171         unsigned char           overflow; /* used for MSG_TRUNC fake */
172 #endif
173
174         char                    *recvcmsgbuf;
175         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
176         char                    *sendcmsgbuf;
177         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
178 };
179
180 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
181 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
182
183 struct isc_socketmgr {
184         /* Not locked. */
185         unsigned int            magic;
186         isc_mem_t              *mctx;
187         isc_mutex_t             lock;
188         /* Locked by manager lock. */
189         ISC_LIST(isc_socket_t)  socklist;
190         fd_set                  read_fds;
191         fd_set                  write_fds;
192         isc_socket_t           *fds[FD_SETSIZE];
193         int                     fdstate[FD_SETSIZE];
194         int                     maxfd;
195 #ifdef ISC_PLATFORM_USETHREADS
196         isc_thread_t            watcher;
197         isc_condition_t         shutdown_ok;
198         int                     pipe_fds[2];
199 #else /* ISC_PLATFORM_USETHREADS */
200         unsigned int            refs;
201 #endif /* ISC_PLATFORM_USETHREADS */
202 };
203
204 #ifndef ISC_PLATFORM_USETHREADS
205 static isc_socketmgr_t *socketmgr = NULL;
206 #endif /* ISC_PLATFORM_USETHREADS */
207
208 #define CLOSED          0       /* this one must be zero */
209 #define MANAGED         1
210 #define CLOSE_PENDING   2
211
212 /*
213  * send() and recv() iovec counts
214  */
215 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
216 #ifdef ISC_NET_RECVOVERFLOW
217 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
218 #else
219 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
220 #endif
221
222 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
223 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
224 static void free_socket(isc_socket_t **);
225 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
226                                     isc_socket_t **);
227 static void destroy(isc_socket_t **);
228 static void internal_accept(isc_task_t *, isc_event_t *);
229 static void internal_connect(isc_task_t *, isc_event_t *);
230 static void internal_recv(isc_task_t *, isc_event_t *);
231 static void internal_send(isc_task_t *, isc_event_t *);
232 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
233 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
234                               struct msghdr *, struct iovec *, size_t *);
235 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
236                               struct msghdr *, struct iovec *, size_t *);
237
238 #define SELECT_POKE_SHUTDOWN            (-1)
239 #define SELECT_POKE_NOTHING             (-2)
240 #define SELECT_POKE_READ                (-3)
241 #define SELECT_POKE_ACCEPT              (-3) /* Same as _READ */
242 #define SELECT_POKE_WRITE               (-4)
243 #define SELECT_POKE_CONNECT             (-4) /* Same as _WRITE */
244 #define SELECT_POKE_CLOSE               (-5)
245
246 #define SOCK_DEAD(s)                    ((s)->references == 0)
247
248 static void
249 manager_log(isc_socketmgr_t *sockmgr,
250             isc_logcategory_t *category, isc_logmodule_t *module, int level,
251             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
252 static void
253 manager_log(isc_socketmgr_t *sockmgr,
254             isc_logcategory_t *category, isc_logmodule_t *module, int level,
255             const char *fmt, ...)
256 {
257         char msgbuf[2048];
258         va_list ap;
259
260         if (! isc_log_wouldlog(isc_lctx, level))
261                 return;
262
263         va_start(ap, fmt);
264         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
265         va_end(ap);
266
267         isc_log_write(isc_lctx, category, module, level,
268                       "sockmgr %p: %s", sockmgr, msgbuf);
269 }
270
271 static void
272 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
273            isc_logcategory_t *category, isc_logmodule_t *module, int level,
274            isc_msgcat_t *msgcat, int msgset, int message,
275            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
276 static void
277 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
278            isc_logcategory_t *category, isc_logmodule_t *module, int level,
279            isc_msgcat_t *msgcat, int msgset, int message,
280            const char *fmt, ...)
281 {
282         char msgbuf[2048];
283         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
284         va_list ap;
285
286         if (! isc_log_wouldlog(isc_lctx, level))
287                 return;
288
289         va_start(ap, fmt);
290         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
291         va_end(ap);
292
293         if (address == NULL) {
294                 isc_log_iwrite(isc_lctx, category, module, level,
295                                msgcat, msgset, message,
296                                "socket %p: %s", sock, msgbuf);
297         } else {
298                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
299                 isc_log_iwrite(isc_lctx, category, module, level,
300                                msgcat, msgset, message,
301                                "socket %p %s: %s", sock, peerbuf, msgbuf);
302         }
303 }
304
305 static void
306 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
307         isc_socket_t *sock;
308
309         /*
310          * This is a wakeup on a socket.  If the socket is not in the
311          * process of being closed, start watching it for either reads
312          * or writes.
313          */
314
315         INSIST(fd >= 0 && fd < (int)FD_SETSIZE);
316
317         if (manager->fdstate[fd] == CLOSE_PENDING) {
318                 manager->fdstate[fd] = CLOSED;
319                 FD_CLR(fd, &manager->read_fds);
320                 FD_CLR(fd, &manager->write_fds);
321                 (void)close(fd);
322                 return;
323         }
324         if (manager->fdstate[fd] != MANAGED)
325                 return;
326
327         sock = manager->fds[fd];
328
329         /*
330          * Set requested bit.
331          */
332         if (msg == SELECT_POKE_READ)
333                 FD_SET(sock->fd, &manager->read_fds);
334         if (msg == SELECT_POKE_WRITE)
335                 FD_SET(sock->fd, &manager->write_fds);
336 }
337
338 #ifdef ISC_PLATFORM_USETHREADS
339 /*
340  * Poke the select loop when there is something for us to do.
341  * The write is required (by POSIX) to complete.  That is, we
342  * will not get partial writes.
343  */
344 static void
345 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
346         int cc;
347         int buf[2];
348         char strbuf[ISC_STRERRORSIZE];
349
350         buf[0] = fd;
351         buf[1] = msg;
352
353         do {
354                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
355 #ifdef ENOSR
356                 /*
357                  * Treat ENOSR as EAGAIN but loop slowly as it is
358                  * unlikely to clear fast.
359                  */
360                 if (cc < 0 && errno == ENOSR) {
361                         sleep(1);
362                         errno = EAGAIN;
363                 }
364 #endif
365         } while (cc < 0 && SOFT_ERROR(errno));
366
367         if (cc < 0) {
368                 isc__strerror(errno, strbuf, sizeof(strbuf));
369                 FATAL_ERROR(__FILE__, __LINE__,
370                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
371                                            ISC_MSG_WRITEFAILED,
372                                            "write() failed "
373                                            "during watcher poke: %s"),
374                             strbuf);
375         }
376
377         INSIST(cc == sizeof(buf));
378 }
379
380 /*
381  * Read a message on the internal fd.
382  */
383 static void
384 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
385         int buf[2];
386         int cc;
387         char strbuf[ISC_STRERRORSIZE];
388
389         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
390         if (cc < 0) {
391                 *msg = SELECT_POKE_NOTHING;
392                 *fd = -1;       /* Silence compiler. */
393                 if (SOFT_ERROR(errno))
394                         return;
395
396                 isc__strerror(errno, strbuf, sizeof(strbuf));
397                 FATAL_ERROR(__FILE__, __LINE__,
398                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
399                                            ISC_MSG_READFAILED,
400                                            "read() failed "
401                                            "during watcher poke: %s"),
402                             strbuf);
403                 
404                 return;
405         }
406         INSIST(cc == sizeof(buf));
407
408         *fd = buf[0];
409         *msg = buf[1];
410 }
411 #else /* ISC_PLATFORM_USETHREADS */
412 /*
413  * Update the state of the socketmgr when something changes.
414  */
415 static void
416 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
417         if (msg == SELECT_POKE_SHUTDOWN)
418                 return;
419         else if (fd >= 0)
420                 wakeup_socket(manager, fd, msg);
421         return;
422 }
423 #endif /* ISC_PLATFORM_USETHREADS */
424
425 /*
426  * Make a fd non-blocking.
427  */
428 static isc_result_t
429 make_nonblock(int fd) {
430         int ret;
431         int flags;
432         char strbuf[ISC_STRERRORSIZE];
433 #ifdef USE_FIONBIO_IOCTL
434         int on = 1;
435
436         ret = ioctl(fd, FIONBIO, (char *)&on);
437 #else
438         flags = fcntl(fd, F_GETFL, 0);
439         flags |= PORT_NONBLOCK;
440         ret = fcntl(fd, F_SETFL, flags);
441 #endif
442
443         if (ret == -1) {
444                 isc__strerror(errno, strbuf, sizeof(strbuf));
445                 UNEXPECTED_ERROR(__FILE__, __LINE__,
446 #ifdef USE_FIONBIO_IOCTL
447                                  "ioctl(%d, FIONBIO, &on): %s", fd,
448 #else
449                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
450 #endif
451                                  strbuf);
452
453                 return (ISC_R_UNEXPECTED);
454         }
455
456         return (ISC_R_SUCCESS);
457 }
458
459 #ifdef USE_CMSG
460 /*
461  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
462  * In order to ensure as much portability as possible, we provide wrapper
463  * functions of these macros.
464  * Note that cmsg_space() could run slow on OSes that do not have
465  * CMSG_SPACE.
466  */
467 static inline ISC_SOCKADDR_LEN_T
468 cmsg_len(ISC_SOCKADDR_LEN_T len) {
469 #ifdef CMSG_LEN
470         return (CMSG_LEN(len));
471 #else
472         ISC_SOCKADDR_LEN_T hdrlen;
473
474         /*
475          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
476          * is correct.
477          */
478         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
479         return (hdrlen + len);
480 #endif
481 }
482
483 static inline ISC_SOCKADDR_LEN_T
484 cmsg_space(ISC_SOCKADDR_LEN_T len) {
485 #ifdef CMSG_SPACE
486         return (CMSG_SPACE(len));
487 #else
488         struct msghdr msg;
489         struct cmsghdr *cmsgp;
490         /*
491          * XXX: The buffer length is an ad-hoc value, but should be enough
492          * in a practical sense.
493          */
494         char dummybuf[sizeof(struct cmsghdr) + 1024];
495
496         memset(&msg, 0, sizeof(msg));
497         msg.msg_control = dummybuf;
498         msg.msg_controllen = sizeof(dummybuf);
499
500         cmsgp = (struct cmsghdr *)dummybuf;
501         cmsgp->cmsg_len = cmsg_len(len);
502
503         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
504         if (cmsgp != NULL)
505                 return ((char *)cmsgp - (char *)msg.msg_control);
506         else
507                 return (0);
508 #endif  
509 }
510 #endif /* USE_CMSG */
511
512 /*
513  * Process control messages received on a socket.
514  */
515 static void
516 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
517 #ifdef USE_CMSG
518         struct cmsghdr *cmsgp;
519 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
520         struct in6_pktinfo *pktinfop;
521 #endif
522 #ifdef SO_TIMESTAMP
523         struct timeval *timevalp;
524 #endif
525 #endif
526
527         /*
528          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
529          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
530          * They are all here, outside of the CPP tests, because it is
531          * more consistent with the usual ISC coding style.
532          */
533         UNUSED(sock);
534         UNUSED(msg);
535         UNUSED(dev);
536
537 #ifdef ISC_NET_BSD44MSGHDR
538
539 #ifdef MSG_TRUNC
540         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
541                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
542 #endif
543
544 #ifdef MSG_CTRUNC
545         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
546                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
547 #endif
548
549 #ifndef USE_CMSG
550         return;
551 #else
552         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
553                 return;
554
555 #ifdef SO_TIMESTAMP
556         timevalp = NULL;
557 #endif
558 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
559         pktinfop = NULL;
560 #endif
561
562         cmsgp = CMSG_FIRSTHDR(msg);
563         while (cmsgp != NULL) {
564                 socket_log(sock, NULL, TRACE,
565                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
566                            "processing cmsg %p", cmsgp);
567
568 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
569                 if (cmsgp->cmsg_level == IPPROTO_IPV6
570                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
571
572                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
573                         memcpy(&dev->pktinfo, pktinfop,
574                                sizeof(struct in6_pktinfo));
575                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
576                         socket_log(sock, NULL, TRACE,
577                                    isc_msgcat, ISC_MSGSET_SOCKET,
578                                    ISC_MSG_IFRECEIVED,
579                                    "interface received on ifindex %u",
580                                    dev->pktinfo.ipi6_ifindex);
581                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
582                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;                         
583                         goto next;
584                 }
585 #endif
586
587 #ifdef SO_TIMESTAMP
588                 if (cmsgp->cmsg_level == SOL_SOCKET
589                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
590                         timevalp = (struct timeval *)CMSG_DATA(cmsgp);
591                         dev->timestamp.seconds = timevalp->tv_sec;
592                         dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
593                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
594                         goto next;
595                 }
596 #endif
597
598         next:
599                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
600         }
601 #endif /* USE_CMSG */
602
603 #endif /* ISC_NET_BSD44MSGHDR */
604 }
605
606 /*
607  * Construct an iov array and attach it to the msghdr passed in.  This is
608  * the SEND constructor, which will use the used region of the buffer
609  * (if using a buffer list) or will use the internal region (if a single
610  * buffer I/O is requested).
611  *
612  * Nothing can be NULL, and the done event must list at least one buffer
613  * on the buffer linked list for this function to be meaningful.
614  *
615  * If write_countp != NULL, *write_countp will hold the number of bytes
616  * this transaction can send.
617  */
618 static void
619 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
620                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
621 {
622         unsigned int iovcount;
623         isc_buffer_t *buffer;
624         isc_region_t used;
625         size_t write_count;
626         size_t skip_count;
627
628         memset(msg, 0, sizeof(*msg));
629
630         if (sock->type == isc_sockettype_udp) {
631                 msg->msg_name = (void *)&dev->address.type.sa;
632                 msg->msg_namelen = dev->address.length;
633         } else {
634                 msg->msg_name = NULL;
635                 msg->msg_namelen = 0;
636         }
637
638         buffer = ISC_LIST_HEAD(dev->bufferlist);
639         write_count = 0;
640         iovcount = 0;
641
642         /*
643          * Single buffer I/O?  Skip what we've done so far in this region.
644          */
645         if (buffer == NULL) {
646                 write_count = dev->region.length - dev->n;
647                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
648                 iov[0].iov_len = write_count;
649                 iovcount = 1;
650
651                 goto config;
652         }
653
654         /*
655          * Multibuffer I/O.
656          * Skip the data in the buffer list that we have already written.
657          */
658         skip_count = dev->n;
659         while (buffer != NULL) {
660                 REQUIRE(ISC_BUFFER_VALID(buffer));
661                 if (skip_count < isc_buffer_usedlength(buffer))
662                         break;
663                 skip_count -= isc_buffer_usedlength(buffer);
664                 buffer = ISC_LIST_NEXT(buffer, link);
665         }
666
667         while (buffer != NULL) {
668                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
669
670                 isc_buffer_usedregion(buffer, &used);
671
672                 if (used.length > 0) {
673                         iov[iovcount].iov_base = (void *)(used.base
674                                                           + skip_count);
675                         iov[iovcount].iov_len = used.length - skip_count;
676                         write_count += (used.length - skip_count);
677                         skip_count = 0;
678                         iovcount++;
679                 }
680                 buffer = ISC_LIST_NEXT(buffer, link);
681         }
682
683         INSIST(skip_count == 0U);
684
685  config:
686         msg->msg_iov = iov;
687         msg->msg_iovlen = iovcount;
688
689 #ifdef ISC_NET_BSD44MSGHDR
690         msg->msg_control = NULL;
691         msg->msg_controllen = 0;
692         msg->msg_flags = 0;
693 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
694         if ((sock->type == isc_sockettype_udp)
695             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
696                 struct cmsghdr *cmsgp;
697                 struct in6_pktinfo *pktinfop;
698
699                 socket_log(sock, NULL, TRACE,
700                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
701                            "sendto pktinfo data, ifindex %u",
702                            dev->pktinfo.ipi6_ifindex);
703
704                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
705                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
706                 msg->msg_control = (void *)sock->sendcmsgbuf;
707
708                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
709                 cmsgp->cmsg_level = IPPROTO_IPV6;
710                 cmsgp->cmsg_type = IPV6_PKTINFO;
711                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
712                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
713                 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
714         }
715 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
716 #else /* ISC_NET_BSD44MSGHDR */
717         msg->msg_accrights = NULL;
718         msg->msg_accrightslen = 0;
719 #endif /* ISC_NET_BSD44MSGHDR */
720
721         if (write_countp != NULL)
722                 *write_countp = write_count;
723 }
724
725 /*
726  * Construct an iov array and attach it to the msghdr passed in.  This is
727  * the RECV constructor, which will use the avialable region of the buffer
728  * (if using a buffer list) or will use the internal region (if a single
729  * buffer I/O is requested).
730  *
731  * Nothing can be NULL, and the done event must list at least one buffer
732  * on the buffer linked list for this function to be meaningful.
733  *
734  * If read_countp != NULL, *read_countp will hold the number of bytes
735  * this transaction can receive.
736  */
737 static void
738 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
739                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
740 {
741         unsigned int iovcount;
742         isc_buffer_t *buffer;
743         isc_region_t available;
744         size_t read_count;
745
746         memset(msg, 0, sizeof(struct msghdr));
747
748         if (sock->type == isc_sockettype_udp) {
749                 memset(&dev->address, 0, sizeof(dev->address));
750 #ifdef BROKEN_RECVMSG
751                 if (sock->pf == AF_INET) {
752                         msg->msg_name = (void *)&dev->address.type.sin;
753                         msg->msg_namelen = sizeof(dev->address.type.sin6);
754                 } else if (sock->pf == AF_INET6) {
755                         msg->msg_name = (void *)&dev->address.type.sin6;
756                         msg->msg_namelen = sizeof(dev->address.type.sin6);
757 #ifdef ISC_PLATFORM_HAVESYSUNH
758                 } else if (sock->pf == AF_UNIX) {
759                         msg->msg_name = (void *)&dev->address.type.sunix;
760                         msg->msg_namelen = sizeof(dev->address.type.sunix);
761 #endif
762                 } else {
763                         msg->msg_name = (void *)&dev->address.type.sa;
764                         msg->msg_namelen = sizeof(dev->address.type);
765                 }
766 #else
767                 msg->msg_name = (void *)&dev->address.type.sa;
768                 msg->msg_namelen = sizeof(dev->address.type);
769 #endif
770 #ifdef ISC_NET_RECVOVERFLOW
771                 /* If needed, steal one iovec for overflow detection. */
772                 maxiov--;
773 #endif
774         } else { /* TCP */
775                 msg->msg_name = NULL;
776                 msg->msg_namelen = 0;
777                 dev->address = sock->address;
778         }
779
780         buffer = ISC_LIST_HEAD(dev->bufferlist);
781         read_count = 0;
782
783         /*
784          * Single buffer I/O?  Skip what we've done so far in this region.
785          */
786         if (buffer == NULL) {
787                 read_count = dev->region.length - dev->n;
788                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
789                 iov[0].iov_len = read_count;
790                 iovcount = 1;
791
792                 goto config;
793         }
794
795         /*
796          * Multibuffer I/O.
797          * Skip empty buffers.
798          */
799         while (buffer != NULL) {
800                 REQUIRE(ISC_BUFFER_VALID(buffer));
801                 if (isc_buffer_availablelength(buffer) != 0)
802                         break;
803                 buffer = ISC_LIST_NEXT(buffer, link);
804         }
805
806         iovcount = 0;
807         while (buffer != NULL) {
808                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
809
810                 isc_buffer_availableregion(buffer, &available);
811
812                 if (available.length > 0) {
813                         iov[iovcount].iov_base = (void *)(available.base);
814                         iov[iovcount].iov_len = available.length;
815                         read_count += available.length;
816                         iovcount++;
817                 }
818                 buffer = ISC_LIST_NEXT(buffer, link);
819         }
820
821  config:
822
823         /*
824          * If needed, set up to receive that one extra byte.  Note that
825          * we know there is at least one iov left, since we stole it
826          * at the top of this function.
827          */
828 #ifdef ISC_NET_RECVOVERFLOW
829         if (sock->type == isc_sockettype_udp) {
830                 iov[iovcount].iov_base = (void *)(&sock->overflow);
831                 iov[iovcount].iov_len = 1;
832                 iovcount++;
833         }
834 #endif
835
836         msg->msg_iov = iov;
837         msg->msg_iovlen = iovcount;
838
839 #ifdef ISC_NET_BSD44MSGHDR
840         msg->msg_control = NULL;
841         msg->msg_controllen = 0;
842         msg->msg_flags = 0;
843 #if defined(USE_CMSG)
844         if (sock->type == isc_sockettype_udp) {
845                 msg->msg_control = sock->recvcmsgbuf;
846                 msg->msg_controllen = sock->recvcmsgbuflen;
847         }
848 #endif /* USE_CMSG */
849 #else /* ISC_NET_BSD44MSGHDR */
850         msg->msg_accrights = NULL;
851         msg->msg_accrightslen = 0;
852 #endif /* ISC_NET_BSD44MSGHDR */
853
854         if (read_countp != NULL)
855                 *read_countp = read_count;
856 }
857
858 static void
859 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
860                 isc_socketevent_t *dev)
861 {
862         if (sock->type == isc_sockettype_udp) {
863                 if (address != NULL)
864                         dev->address = *address;
865                 else
866                         dev->address = sock->address;
867         } else if (sock->type == isc_sockettype_tcp) {
868                 INSIST(address == NULL);
869                 dev->address = sock->address;
870         }
871 }
872
873 static isc_socketevent_t *
874 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
875                      isc_taskaction_t action, const void *arg)
876 {
877         isc_socketevent_t *ev;
878
879         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
880                                                      sock, eventtype,
881                                                      action, arg,
882                                                      sizeof(*ev));
883
884         if (ev == NULL)
885                 return (NULL);
886
887         ev->result = ISC_R_UNEXPECTED;
888         ISC_LINK_INIT(ev, ev_link);
889         ISC_LIST_INIT(ev->bufferlist);
890         ev->region.base = NULL;
891         ev->n = 0;
892         ev->offset = 0;
893         ev->attributes = 0;
894
895         return (ev);
896 }
897
898 #if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of a msghdr (name, scatter/gather
 * vector and, on BSD 4.4 style systems, the control buffer) to stdout.
 *
 * Pointer arguments are cast to (void *) and integer fields are cast to
 * a type matching the conversion specifier, because the underlying types
 * (socklen_t, size_t or int, depending on the platform) do not reliably
 * match "%d".
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %u\n", (void *)msg->msg_name,
	       (unsigned int)msg->msg_namelen);
	printf("\tiov %p, iovlen %lu\n", (void *)msg->msg_iov,
	       (unsigned long)msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		printf("\t\t%u\tbase %p, len %lu\n", i,
		       (void *)msg->msg_iov[i].iov_base,
		       (unsigned long)msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %lu\n", (void *)msg->msg_control,
	       (unsigned long)msg->msg_controllen);
#endif
}
915 #endif
916
917 #define DOIO_SUCCESS            0       /* i/o ok, event sent */
918 #define DOIO_SOFT               1       /* i/o ok, soft error, no event sent */
919 #define DOIO_HARD               2       /* i/o error, event sent */
920 #define DOIO_EOF                3       /* EOF, no event sent */
921
922 static int
923 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
924         int cc;
925         struct iovec iov[MAXSCATTERGATHER_RECV];
926         size_t read_count;
927         size_t actual_count;
928         struct msghdr msghdr;
929         isc_buffer_t *buffer;
930         int recv_errno;
931         char strbuf[ISC_STRERRORSIZE];
932
933         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
934
935 #if defined(ISC_SOCKET_DEBUG)
936         dump_msg(&msghdr);
937 #endif
938
939         cc = recvmsg(sock->fd, &msghdr, 0);
940         recv_errno = errno;
941
942 #if defined(ISC_SOCKET_DEBUG)
943         dump_msg(&msghdr);
944 #endif
945
946         if (cc < 0) {
947                 if (SOFT_ERROR(recv_errno))
948                         return (DOIO_SOFT);
949
950                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
951                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
952                         socket_log(sock, NULL, IOEVENT,
953                                    isc_msgcat, ISC_MSGSET_SOCKET,
954                                    ISC_MSG_DOIORECV, 
955                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
956                                    sock->fd, cc, recv_errno, strbuf);
957                 }
958
959 #define SOFT_OR_HARD(_system, _isc) \
960         if (recv_errno == _system) { \
961                 if (sock->connected) { \
962                         dev->result = _isc; \
963                         return (DOIO_HARD); \
964                 } \
965                 return (DOIO_SOFT); \
966         }
967 #define ALWAYS_HARD(_system, _isc) \
968         if (recv_errno == _system) { \
969                 dev->result = _isc; \
970                 return (DOIO_HARD); \
971         }
972
973                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
974                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
975                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
976                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
977                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
978                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
979                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
980
981 #undef SOFT_OR_HARD
982 #undef ALWAYS_HARD
983
984                 dev->result = isc__errno2result(recv_errno);
985                 return (DOIO_HARD);
986         }
987
988         /*
989          * On TCP, zero length reads indicate EOF, while on
990          * UDP, zero length reads are perfectly valid, although
991          * strange.
992          */
993         if ((sock->type == isc_sockettype_tcp) && (cc == 0))
994                 return (DOIO_EOF);
995
996         if (sock->type == isc_sockettype_udp) {
997                 dev->address.length = msghdr.msg_namelen;
998                 if (isc_sockaddr_getport(&dev->address) == 0) {
999                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1000                                 socket_log(sock, &dev->address, IOEVENT,
1001                                            isc_msgcat, ISC_MSGSET_SOCKET,
1002                                            ISC_MSG_ZEROPORT, 
1003                                            "dropping source port zero packet");
1004                         }
1005                         return (DOIO_SOFT);
1006                 }
1007         }
1008
1009         socket_log(sock, &dev->address, IOEVENT,
1010                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1011                    "packet received correctly");
1012
1013         /*
1014          * Overflow bit detection.  If we received MORE bytes than we should,
1015          * this indicates an overflow situation.  Set the flag in the
1016          * dev entry and adjust how much we read by one.
1017          */
1018 #ifdef ISC_NET_RECVOVERFLOW
1019         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1020                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1021                 cc--;
1022         }
1023 #endif
1024
1025         /*
1026          * If there are control messages attached, run through them and pull
1027          * out the interesting bits.
1028          */
1029         if (sock->type == isc_sockettype_udp)
1030                 process_cmsg(sock, &msghdr, dev);
1031
1032         /*
1033          * update the buffers (if any) and the i/o count
1034          */
1035         dev->n += cc;
1036         actual_count = cc;
1037         buffer = ISC_LIST_HEAD(dev->bufferlist);
1038         while (buffer != NULL && actual_count > 0U) {
1039                 REQUIRE(ISC_BUFFER_VALID(buffer));
1040                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1041                         actual_count -= isc_buffer_availablelength(buffer);
1042                         isc_buffer_add(buffer,
1043                                        isc_buffer_availablelength(buffer));
1044                 } else {
1045                         isc_buffer_add(buffer, actual_count);
1046                         actual_count = 0;
1047                         break;
1048                 }
1049                 buffer = ISC_LIST_NEXT(buffer, link);
1050                 if (buffer == NULL) {
1051                         INSIST(actual_count == 0U);
1052                 }
1053         }
1054
1055         /*
1056          * If we read less than we expected, update counters,
1057          * and let the upper layer poke the descriptor.
1058          */
1059         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1060                 return (DOIO_SOFT);
1061
1062         /*
1063          * Full reads are posted, or partials if partials are ok.
1064          */
1065         dev->result = ISC_R_SUCCESS;
1066         return (DOIO_SUCCESS);
1067 }
1068
1069 /*
1070  * Returns:
1071  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1072  *                      ISC_R_SUCCESS.
1073  *
1074  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1075  *                      dev->result contains the appropriate error.
1076  *
1077  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1078  *                      event was sent.  The operation should be retried.
1079  *
1080  *      No other return values are possible.
1081  */
1082 static int
1083 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1084         int cc;
1085         struct iovec iov[MAXSCATTERGATHER_SEND];
1086         size_t write_count;
1087         struct msghdr msghdr;
1088         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1089         int attempts = 0;
1090         int send_errno;
1091         char strbuf[ISC_STRERRORSIZE];
1092
1093         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1094
1095  resend:
1096         cc = sendmsg(sock->fd, &msghdr, 0);
1097         send_errno = errno;
1098
1099         /*
1100          * Check for error or block condition.
1101          */
1102         if (cc < 0) {
1103                 if (send_errno == EINTR && ++attempts < NRETRIES)
1104                         goto resend;
1105
1106                 if (SOFT_ERROR(send_errno))
1107                         return (DOIO_SOFT);
1108
1109 #define SOFT_OR_HARD(_system, _isc) \
1110         if (send_errno == _system) { \
1111                 if (sock->connected) { \
1112                         dev->result = _isc; \
1113                         return (DOIO_HARD); \
1114                 } \
1115                 return (DOIO_SOFT); \
1116         }
1117 #define ALWAYS_HARD(_system, _isc) \
1118         if (send_errno == _system) { \
1119                 dev->result = _isc; \
1120                 return (DOIO_HARD); \
1121         }
1122
1123                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1124                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1125                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1126                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1127                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1128 #ifdef EHOSTDOWN
1129                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1130 #endif
1131                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1132                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1133                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1134                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1135                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1136
1137 #undef SOFT_OR_HARD
1138 #undef ALWAYS_HARD
1139
1140                 /*
1141                  * The other error types depend on whether or not the
1142                  * socket is UDP or TCP.  If it is UDP, some errors
1143                  * that we expect to be fatal under TCP are merely
1144                  * annoying, and are really soft errors.
1145                  *
1146                  * However, these soft errors are still returned as
1147                  * a status.
1148                  */
1149                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1150                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1151                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1152                                  addrbuf, strbuf);
1153                 dev->result = isc__errno2result(send_errno);
1154                 return (DOIO_HARD);
1155         }
1156
1157         if (cc == 0)
1158                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1159                                  "internal_send: send() %s 0",
1160                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1161                                                 ISC_MSG_RETURNED, "returned"));
1162
1163         /*
1164          * If we write less than we expected, update counters, poke.
1165          */
1166         dev->n += cc;
1167         if ((size_t)cc != write_count)
1168                 return (DOIO_SOFT);
1169
1170         /*
1171          * Exactly what we wanted to write.  We're done with this
1172          * entry.  Post its completion event.
1173          */
1174         dev->result = ISC_R_SUCCESS;
1175         return (DOIO_SUCCESS);
1176 }
1177
1178 /*
1179  * Kill.
1180  *
1181  * Caller must ensure that the socket is not locked and no external
1182  * references exist.
1183  */
1184 static void
1185 destroy(isc_socket_t **sockp) {
1186         isc_socket_t *sock = *sockp;
1187         isc_socketmgr_t *manager = sock->manager;
1188
1189         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1190                    ISC_MSG_DESTROYING, "destroying");
1191
1192         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1193         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1194         INSIST(ISC_LIST_EMPTY(sock->send_list));
1195         INSIST(sock->connect_ev == NULL);
1196         REQUIRE(sock->fd >= 0 && sock->fd < (int)FD_SETSIZE);
1197
1198         LOCK(&manager->lock);
1199
1200         /*
1201          * No one has this socket open, so the watcher doesn't have to be
1202          * poked, and the socket doesn't have to be locked.
1203          */
1204         manager->fds[sock->fd] = NULL;
1205         manager->fdstate[sock->fd] = CLOSE_PENDING;
1206         select_poke(manager, sock->fd, SELECT_POKE_CLOSE);
1207         ISC_LIST_UNLINK(manager->socklist, sock, link);
1208
1209 #ifdef ISC_PLATFORM_USETHREADS
1210         if (ISC_LIST_EMPTY(manager->socklist))
1211                 SIGNAL(&manager->shutdown_ok);
1212 #endif /* ISC_PLATFORM_USETHREADS */
1213
1214         /*
1215          * XXX should reset manager->maxfd here
1216          */
1217
1218         UNLOCK(&manager->lock);
1219
1220         free_socket(sockp);
1221 }
1222
1223 static isc_result_t
1224 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1225                 isc_socket_t **socketp)
1226 {
1227         isc_socket_t *sock;
1228         isc_result_t ret;
1229         ISC_SOCKADDR_LEN_T cmsgbuflen;
1230
1231         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1232
1233         if (sock == NULL)
1234                 return (ISC_R_NOMEMORY);
1235
1236         ret = ISC_R_UNEXPECTED;
1237
1238         sock->magic = 0;
1239         sock->references = 0;
1240
1241         sock->manager = manager;
1242         sock->type = type;
1243         sock->fd = -1;
1244
1245         ISC_LINK_INIT(sock, link);
1246
1247         sock->recvcmsgbuf = NULL;
1248         sock->sendcmsgbuf = NULL;
1249
1250         /*
1251          * set up cmsg buffers
1252          */
1253         cmsgbuflen = 0;
1254 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1255         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1256 #endif
1257 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1258         cmsgbuflen += cmsg_space(sizeof(struct timeval));
1259 #endif
1260         sock->recvcmsgbuflen = cmsgbuflen;
1261         if (sock->recvcmsgbuflen != 0U) {
1262                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1263                 if (sock->recvcmsgbuf == NULL)
1264                         goto error;
1265         }
1266
1267         cmsgbuflen = 0;
1268 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1269         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1270 #endif
1271         sock->sendcmsgbuflen = cmsgbuflen;
1272         if (sock->sendcmsgbuflen != 0U) {
1273                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1274                 if (sock->sendcmsgbuf == NULL)
1275                         goto error;
1276         }
1277
1278         /*
1279          * set up list of readers and writers to be initially empty
1280          */
1281         ISC_LIST_INIT(sock->recv_list);
1282         ISC_LIST_INIT(sock->send_list);
1283         ISC_LIST_INIT(sock->accept_list);
1284         sock->connect_ev = NULL;
1285         sock->pending_recv = 0;
1286         sock->pending_send = 0;
1287         sock->pending_accept = 0;
1288         sock->listener = 0;
1289         sock->connected = 0;
1290         sock->connecting = 0;
1291         sock->bound = 0;
1292
1293         /*
1294          * initialize the lock
1295          */
1296         if (isc_mutex_init(&sock->lock) != ISC_R_SUCCESS) {
1297                 sock->magic = 0;
1298                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1299                                  "isc_mutex_init() %s",
1300                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1301                                                 ISC_MSG_FAILED, "failed"));
1302                 ret = ISC_R_UNEXPECTED;
1303                 goto error;
1304         }
1305
1306         /*
1307          * Initialize readable and writable events
1308          */
1309         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1310                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1311                        NULL, sock, sock, NULL, NULL);
1312         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1313                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1314                        NULL, sock, sock, NULL, NULL);
1315
1316         sock->magic = SOCKET_MAGIC;
1317         *socketp = sock;
1318
1319         return (ISC_R_SUCCESS);
1320
1321  error:
1322         if (sock->recvcmsgbuf != NULL)
1323                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1324                             sock->recvcmsgbuflen);
1325         if (sock->sendcmsgbuf != NULL)
1326                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1327                             sock->sendcmsgbuflen);
1328         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1329
1330         return (ret);
1331 }
1332
1333 /*
1334  * This event requires that the various lists be empty, that the reference
1335  * count be 1, and that the magic number is valid.  The other socket bits,
1336  * like the lock, must be initialized as well.  The fd associated must be
1337  * marked as closed, by setting it to -1 on close, or this routine will
1338  * also close the socket.
1339  */
1340 static void
1341 free_socket(isc_socket_t **socketp) {
1342         isc_socket_t *sock = *socketp;
1343
1344         INSIST(sock->references == 0);
1345         INSIST(VALID_SOCKET(sock));
1346         INSIST(!sock->connecting);
1347         INSIST(!sock->pending_recv);
1348         INSIST(!sock->pending_send);
1349         INSIST(!sock->pending_accept);
1350         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1351         INSIST(ISC_LIST_EMPTY(sock->send_list));
1352         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1353         INSIST(!ISC_LINK_LINKED(sock, link));
1354
1355         if (sock->recvcmsgbuf != NULL)
1356                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1357                             sock->recvcmsgbuflen);
1358         if (sock->sendcmsgbuf != NULL)
1359                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1360                             sock->sendcmsgbuflen);
1361
1362         sock->magic = 0;
1363
1364         DESTROYLOCK(&sock->lock);
1365
1366         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1367
1368         *socketp = NULL;
1369 }
1370
1371 /*
1372  * Create a new 'type' socket managed by 'manager'.  Events
1373  * will be posted to 'task' and when dispatched 'action' will be
1374  * called with 'arg' as the arg value.  The new socket is returned
1375  * in 'socketp'.
1376  */
1377 isc_result_t
1378 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1379                   isc_socket_t **socketp)
1380 {
1381         isc_socket_t *sock = NULL;
1382         isc_result_t ret;
1383 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
1384         int on = 1;
1385 #endif
1386         char strbuf[ISC_STRERRORSIZE];
1387         const char *err = "socket";
1388
1389         REQUIRE(VALID_MANAGER(manager));
1390         REQUIRE(socketp != NULL && *socketp == NULL);
1391
1392         ret = allocate_socket(manager, type, &sock);
1393         if (ret != ISC_R_SUCCESS)
1394                 return (ret);
1395
1396         sock->pf = pf;
1397         switch (type) {
1398         case isc_sockettype_udp:
1399                 sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1400                 break;
1401         case isc_sockettype_tcp:
1402                 sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1403                 break;
1404         }
1405
1406 #ifdef F_DUPFD
1407         /*
1408          * Leave a space for stdio to work in.
1409          */
1410         if (sock->fd >= 0 && sock->fd < 20) {
1411                 int new, tmp;
1412                 new = fcntl(sock->fd, F_DUPFD, 20);
1413                 tmp = errno;
1414                 (void)close(sock->fd);
1415                 errno = tmp;
1416                 sock->fd = new;
1417                 err = "isc_socket_create: fcntl";
1418         }
1419 #endif
1420
1421         if (sock->fd >= (int)FD_SETSIZE) {
1422                 (void)close(sock->fd);
1423                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1424                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1425                                isc_msgcat, ISC_MSGSET_SOCKET,
1426                                ISC_MSG_TOOMANYFDS,
1427                                "%s: too many open file descriptors", "socket");
1428                 free_socket(&sock);
1429                 return (ISC_R_NORESOURCES);
1430         }
1431         
1432         if (sock->fd < 0) {
1433                 free_socket(&sock);
1434
1435                 switch (errno) {
1436                 case EMFILE:
1437                 case ENFILE:
1438                 case ENOBUFS:
1439                         return (ISC_R_NORESOURCES);
1440
1441                 case EPROTONOSUPPORT:
1442                 case EPFNOSUPPORT:
1443                 case EAFNOSUPPORT:
1444                 /*
1445                  * Linux 2.2 (and maybe others) return EINVAL instead of
1446                  * EAFNOSUPPORT.
1447                  */
1448                 case EINVAL:
1449                         return (ISC_R_FAMILYNOSUPPORT);
1450
1451                 default:
1452                         isc__strerror(errno, strbuf, sizeof(strbuf));
1453                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1454                                          "%s() %s: %s", err,
1455                                          isc_msgcat_get(isc_msgcat,
1456                                                         ISC_MSGSET_GENERAL,
1457                                                         ISC_MSG_FAILED,
1458                                                         "failed"),
1459                                          strbuf);
1460                         return (ISC_R_UNEXPECTED);
1461                 }
1462         }
1463
1464         if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
1465                 (void)close(sock->fd);
1466                 free_socket(&sock);
1467                 return (ISC_R_UNEXPECTED);
1468         }
1469
1470 #ifdef SO_BSDCOMPAT
1471         if (setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
1472                        (void *)&on, sizeof(on)) < 0) {
1473                 isc__strerror(errno, strbuf, sizeof(strbuf));
1474                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1475                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
1476                                  sock->fd,
1477                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1478                                                 ISC_MSG_FAILED, "failed"),
1479                                  strbuf);
1480                 /* Press on... */
1481         }
1482 #endif
1483
1484 #if defined(USE_CMSG)
1485         if (type == isc_sockettype_udp) {
1486
1487 #if defined(SO_TIMESTAMP)
1488                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1489                                (void *)&on, sizeof(on)) < 0
1490                     && errno != ENOPROTOOPT) {
1491                         isc__strerror(errno, strbuf, sizeof(strbuf));
1492                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1493                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1494                                          sock->fd, 
1495                                          isc_msgcat_get(isc_msgcat,
1496                                                         ISC_MSGSET_GENERAL,
1497                                                         ISC_MSG_FAILED,
1498                                                         "failed"),
1499                                          strbuf);
1500                         /* Press on... */
1501                 }
1502 #endif /* SO_TIMESTAMP */
1503
1504 #if defined(ISC_PLATFORM_HAVEIPV6)
1505                 if (pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
1506                         /*
1507                          * Warn explicitly because this anomaly can be hidden
1508                          * in usual operation (and unexpectedly appear later).
1509                          */
1510                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1511                                          "No buffer available to receive "
1512                                          "IPv6 destination");
1513                 }
1514 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1515 #ifdef IPV6_RECVPKTINFO
1516                 /* 2292bis */
1517                 if ((pf == AF_INET6)
1518                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1519                                    (void *)&on, sizeof(on)) < 0)) {
1520                         isc__strerror(errno, strbuf, sizeof(strbuf));
1521                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1522                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
1523                                          "%s: %s", sock->fd,
1524                                          isc_msgcat_get(isc_msgcat,
1525                                                         ISC_MSGSET_GENERAL,
1526                                                         ISC_MSG_FAILED,
1527                                                         "failed"),
1528                                          strbuf);
1529                 }
1530 #else
1531                 /* 2292 */
1532                 if ((pf == AF_INET6)
1533                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1534                                    (void *)&on, sizeof(on)) < 0)) {
1535                         isc__strerror(errno, strbuf, sizeof(strbuf));
1536                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1537                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1538                                          sock->fd,
1539                                          isc_msgcat_get(isc_msgcat,
1540                                                         ISC_MSGSET_GENERAL,
1541                                                         ISC_MSG_FAILED,
1542                                                         "failed"),
1543                                          strbuf);
1544                 }
1545 #endif /* IPV6_RECVPKTINFO */
1546 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
1547 #ifdef IPV6_USE_MIN_MTU        /*2292bis, not too common yet*/
1548                 /* use minimum MTU */
1549                 if (pf == AF_INET6) {
1550                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
1551                                          IPV6_USE_MIN_MTU,
1552                                          (void *)&on, sizeof(on));
1553                 }
1554 #endif
1555 #endif /* ISC_PLATFORM_HAVEIPV6 */
1556
1557         }
1558 #endif /* USE_CMSG */
1559
1560         sock->references = 1;
1561         *socketp = sock;
1562
1563         LOCK(&manager->lock);
1564
1565         /*
1566          * Note we don't have to lock the socket like we normally would because
1567          * there are no external references to it yet.
1568          */
1569
1570         manager->fds[sock->fd] = sock;
1571         manager->fdstate[sock->fd] = MANAGED;
1572         ISC_LIST_APPEND(manager->socklist, sock, link);
1573         if (manager->maxfd < sock->fd)
1574                 manager->maxfd = sock->fd;
1575
1576         UNLOCK(&manager->lock);
1577
1578         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1579                    ISC_MSG_CREATED, "created");
1580
1581         return (ISC_R_SUCCESS);
1582 }
1583
1584 /*
1585  * Attach to a socket.  Caller must explicitly detach when it is done.
1586  */
1587 void
1588 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1589         REQUIRE(VALID_SOCKET(sock));
1590         REQUIRE(socketp != NULL && *socketp == NULL);
1591
1592         LOCK(&sock->lock);
1593         sock->references++;
1594         UNLOCK(&sock->lock);
1595
1596         *socketp = sock;
1597 }
1598
1599 /*
1600  * Dereference a socket.  If this is the last reference to it, clean things
1601  * up by destroying the socket.
1602  */
1603 void
1604 isc_socket_detach(isc_socket_t **socketp) {
1605         isc_socket_t *sock;
1606         isc_boolean_t kill_socket = ISC_FALSE;
1607
1608         REQUIRE(socketp != NULL);
1609         sock = *socketp;
1610         REQUIRE(VALID_SOCKET(sock));
1611
1612         LOCK(&sock->lock);
1613         REQUIRE(sock->references > 0);
1614         sock->references--;
1615         if (sock->references == 0)
1616                 kill_socket = ISC_TRUE;
1617         UNLOCK(&sock->lock);
1618
1619         if (kill_socket)
1620                 destroy(&sock);
1621
1622         *socketp = NULL;
1623 }
1624
1625 /*
1626  * I/O is possible on a given socket.  Schedule an event to this task that
1627  * will call an internal function to do the I/O.  This will charge the
1628  * task with the I/O operation and let our select loop handler get back
1629  * to doing something real as fast as possible.
1630  *
1631  * The socket and manager must be locked before calling this function.
1632  */
1633 static void
1634 dispatch_recv(isc_socket_t *sock) {
1635         intev_t *iev;
1636         isc_socketevent_t *ev;
1637
1638         INSIST(!sock->pending_recv);
1639
1640         ev = ISC_LIST_HEAD(sock->recv_list);
1641         if (ev == NULL)
1642                 return;
1643
1644         sock->pending_recv = 1;
1645         iev = &sock->readable_ev;
1646
1647         socket_log(sock, NULL, EVENT, NULL, 0, 0,
1648                    "dispatch_recv:  event %p -> task %p", ev, ev->ev_sender);
1649
1650         sock->references++;
1651         iev->ev_sender = sock;
1652         iev->ev_action = internal_recv;
1653         iev->ev_arg = sock;
1654
1655         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1656 }
1657
1658 static void
1659 dispatch_send(isc_socket_t *sock) {
1660         intev_t *iev;
1661         isc_socketevent_t *ev;
1662
1663         INSIST(!sock->pending_send);
1664
1665         ev = ISC_LIST_HEAD(sock->send_list);
1666         if (ev == NULL)
1667                 return;
1668
1669         sock->pending_send = 1;
1670         iev = &sock->writable_ev;
1671
1672         socket_log(sock, NULL, EVENT, NULL, 0, 0,
1673                    "dispatch_send:  event %p -> task %p", ev, ev->ev_sender);
1674
1675         sock->references++;
1676         iev->ev_sender = sock;
1677         iev->ev_action = internal_send;
1678         iev->ev_arg = sock;
1679
1680         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1681 }
1682
1683 /*
1684  * Dispatch an internal accept event.
1685  */
1686 static void
1687 dispatch_accept(isc_socket_t *sock) {
1688         intev_t *iev;
1689         isc_socket_newconnev_t *ev;
1690
1691         INSIST(!sock->pending_accept);
1692
1693         /*
1694          * Are there any done events left, or were they all canceled
1695          * before the manager got the socket lock?
1696          */
1697         ev = ISC_LIST_HEAD(sock->accept_list);
1698         if (ev == NULL)
1699                 return;
1700
1701         sock->pending_accept = 1;
1702         iev = &sock->readable_ev;
1703
1704         sock->references++;  /* keep socket around for this internal event */
1705         iev->ev_sender = sock;
1706         iev->ev_action = internal_accept;
1707         iev->ev_arg = sock;
1708
1709         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1710 }
1711
1712 static void
1713 dispatch_connect(isc_socket_t *sock) {
1714         intev_t *iev;
1715         isc_socket_connev_t *ev;
1716
1717         iev = &sock->writable_ev;
1718
1719         ev = sock->connect_ev;
1720         INSIST(ev != NULL); /* XXX */
1721
1722         INSIST(sock->connecting);
1723
1724         sock->references++;  /* keep socket around for this internal event */
1725         iev->ev_sender = sock;
1726         iev->ev_action = internal_connect;
1727         iev->ev_arg = sock;
1728
1729         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1730 }
1731
1732 /*
1733  * Dequeue an item off the given socket's read queue, set the result code
1734  * in the done event to the one provided, and send it to the task it was
1735  * destined for.
1736  *
1737  * If the event to be sent is on a list, remove it before sending.  If
1738  * asked to, send and detach from the socket as well.
1739  *
1740  * Caller must have the socket locked if the event is attached to the socket.
1741  */
1742 static void
1743 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1744         isc_task_t *task;
1745
1746         task = (*dev)->ev_sender;
1747
1748         (*dev)->ev_sender = sock;
1749
1750         if (ISC_LINK_LINKED(*dev, ev_link))
1751                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1752
1753         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1754             == ISC_SOCKEVENTATTR_ATTACHED)
1755                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1756         else
1757                 isc_task_send(task, (isc_event_t **)dev);
1758 }
1759
1760 /*
1761  * See comments for send_recvdone_event() above.
1762  *
1763  * Caller must have the socket locked if the event is attached to the socket.
1764  */
1765 static void
1766 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1767         isc_task_t *task;
1768
1769         INSIST(dev != NULL && *dev != NULL);
1770
1771         task = (*dev)->ev_sender;
1772         (*dev)->ev_sender = sock;
1773
1774         if (ISC_LINK_LINKED(*dev, ev_link))
1775                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1776
1777         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1778             == ISC_SOCKEVENTATTR_ATTACHED)
1779                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1780         else
1781                 isc_task_send(task, (isc_event_t **)dev);
1782 }
1783
1784 /*
1785  * Call accept() on a socket, to get the new file descriptor.  The listen
1786  * socket is used as a prototype to create a new isc_socket_t.  The new
1787  * socket has one outstanding reference.  The task receiving the event
1788  * will be detached from just after the event is delivered.
1789  *
1790  * On entry to this function, the event delivered is the internal
1791  * readable event, and the first item on the accept_list should be
1792  * the done event we want to send.  If the list is empty, this is a no-op,
1793  * so just unlock and return.
1794  */
1795 static void
1796 internal_accept(isc_task_t *me, isc_event_t *ev) {
1797         isc_socket_t *sock;
1798         isc_socketmgr_t *manager;
1799         isc_socket_newconnev_t *dev;
1800         isc_task_t *task;
1801         ISC_SOCKADDR_LEN_T addrlen;
1802         int fd;
1803         isc_result_t result = ISC_R_SUCCESS;
1804         char strbuf[ISC_STRERRORSIZE];
1805         const char *err = "accept";
1806
1807         UNUSED(me);
1808
1809         sock = ev->ev_sender;
1810         INSIST(VALID_SOCKET(sock));
1811
1812         LOCK(&sock->lock);
1813         socket_log(sock, NULL, TRACE,
1814                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1815                    "internal_accept called, locked socket");
1816
1817         manager = sock->manager;
1818         INSIST(VALID_MANAGER(manager));
1819
1820         INSIST(sock->listener);
1821         INSIST(sock->pending_accept == 1);
1822         sock->pending_accept = 0;
1823
1824         INSIST(sock->references > 0);
1825         sock->references--;  /* the internal event is done with this socket */
1826         if (sock->references == 0) {
1827                 UNLOCK(&sock->lock);
1828                 destroy(&sock);
1829                 return;
1830         }
1831
1832         /*
1833          * Get the first item off the accept list.
1834          * If it is empty, unlock the socket and return.
1835          */
1836         dev = ISC_LIST_HEAD(sock->accept_list);
1837         if (dev == NULL) {
1838                 UNLOCK(&sock->lock);
1839                 return;
1840         }
1841
1842         /*
1843          * Try to accept the new connection.  If the accept fails with
1844          * EAGAIN or EINTR, simply poke the watcher to watch this socket
1845          * again.  Also ignore ECONNRESET, which has been reported to
1846          * be spuriously returned on Linux 2.2.19 although it is not
1847          * a documented error for accept().  ECONNABORTED has been
1848          * reported for Solaris 8.  The rest are thrown in not because
1849          * we have seen them but because they are ignored by other
1850          * deamons such as BIND 8 and Apache.
1851          */
1852
1853         addrlen = sizeof(dev->newsocket->address.type);
1854         memset(&dev->newsocket->address.type.sa, 0, addrlen);
1855         fd = accept(sock->fd, &dev->newsocket->address.type.sa,
1856                     (void *)&addrlen);
1857
1858 #ifdef F_DUPFD
1859         /*
1860          * Leave a space for stdio to work in.
1861          */
1862         if (fd >= 0 && fd < 20) {
1863                 int new, tmp;
1864                 new = fcntl(fd, F_DUPFD, 20);
1865                 tmp = errno;
1866                 (void)close(fd);
1867                 errno = tmp;
1868                 fd = new;
1869                 err = "fcntl";
1870         }
1871 #endif
1872
1873         if (fd < 0) {
1874                 if (SOFT_ERROR(errno))
1875                         goto soft_error;
1876                 switch (errno) {
1877                 case ENOBUFS:
1878                 case ENFILE:
1879                 case ENOMEM:
1880                 case ECONNRESET:
1881                 case ECONNABORTED:
1882                 case EHOSTUNREACH:
1883                 case EHOSTDOWN:
1884                 case ENETUNREACH:
1885                 case ENETDOWN:
1886                 case ECONNREFUSED:
1887 #ifdef EPROTO
1888                 case EPROTO:
1889 #endif
1890 #ifdef ENONET
1891                 case ENONET:
1892 #endif
1893                         goto soft_error;
1894                 default:
1895                         break;
1896                 }
1897                 isc__strerror(errno, strbuf, sizeof(strbuf));
1898                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1899                                  "internal_accept: %s() %s: %s", err,
1900                                  isc_msgcat_get(isc_msgcat,
1901                                                 ISC_MSGSET_GENERAL,
1902                                                 ISC_MSG_FAILED,
1903                                                 "failed"),
1904                                  strbuf);
1905                 fd = -1;
1906                 result = ISC_R_UNEXPECTED;
1907         } else {
1908                 if (addrlen == 0U) {
1909                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1910                                          "internal_accept(): "
1911                                          "accept() failed to return "
1912                                          "remote address");
1913
1914                         (void)close(fd);
1915                         goto soft_error;
1916                 } else if (dev->newsocket->address.type.sa.sa_family !=
1917                            sock->pf)
1918                 {
1919                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1920                                          "internal_accept(): "
1921                                          "accept() returned peer address "
1922                                          "family %u (expected %u)", 
1923                                          dev->newsocket->address.
1924                                          type.sa.sa_family,
1925                                          sock->pf);
1926                         (void)close(fd);
1927                         goto soft_error;
1928                 } else if (fd >= (int)FD_SETSIZE) {
1929                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1930                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1931                                        isc_msgcat, ISC_MSGSET_SOCKET,
1932                                        ISC_MSG_TOOMANYFDS,
1933                                        "%s: too many open file descriptors",
1934                                        "accept");
1935                         (void)close(fd);
1936                         goto soft_error;
1937                 }
1938         }
1939
1940         if (fd != -1) {
1941                 dev->newsocket->address.length = addrlen;
1942                 dev->newsocket->pf = sock->pf;
1943         }
1944
1945         /*
1946          * Pull off the done event.
1947          */
1948         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
1949
1950         /*
1951          * Poke watcher if there are more pending accepts.
1952          */
1953         if (!ISC_LIST_EMPTY(sock->accept_list))
1954                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
1955
1956         UNLOCK(&sock->lock);
1957
1958         if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
1959                 (void)close(fd);
1960                 fd = -1;
1961                 result = ISC_R_UNEXPECTED;
1962         }
1963
1964         /*
1965          * -1 means the new socket didn't happen.
1966          */
1967         if (fd != -1) {
1968                 LOCK(&manager->lock);
1969                 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
1970
1971                 dev->newsocket->fd = fd;
1972                 dev->newsocket->bound = 1;
1973                 dev->newsocket->connected = 1;
1974
1975                 /*
1976                  * Save away the remote address
1977                  */
1978                 dev->address = dev->newsocket->address;
1979
1980                 manager->fds[fd] = dev->newsocket;
1981                 manager->fdstate[fd] = MANAGED;
1982                 if (manager->maxfd < fd)
1983                         manager->maxfd = fd;
1984
1985                 socket_log(sock, &dev->newsocket->address, CREATION,
1986                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
1987                            "accepted connection, new socket %p",
1988                            dev->newsocket);
1989
1990                 UNLOCK(&manager->lock);
1991         } else {
1992                 dev->newsocket->references--;
1993                 free_socket(&dev->newsocket);
1994         }
1995         
1996         /*
1997          * Fill in the done event details and send it off.
1998          */
1999         dev->result = result;
2000         task = dev->ev_sender;
2001         dev->ev_sender = sock;
2002
2003         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2004         return;
2005
2006  soft_error:
2007         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
2008         UNLOCK(&sock->lock);
2009         return;
2010 }
2011
2012 static void
2013 internal_recv(isc_task_t *me, isc_event_t *ev) {
2014         isc_socketevent_t *dev;
2015         isc_socket_t *sock;
2016
2017         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
2018
2019         sock = ev->ev_sender;
2020         INSIST(VALID_SOCKET(sock));
2021
2022         LOCK(&sock->lock);
2023         socket_log(sock, NULL, IOEVENT,
2024                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2025                    "internal_recv: task %p got event %p", me, ev);
2026
2027         INSIST(sock->pending_recv == 1);
2028         sock->pending_recv = 0;
2029
2030         INSIST(sock->references > 0);
2031         sock->references--;  /* the internal event is done with this socket */
2032         if (sock->references == 0) {
2033                 UNLOCK(&sock->lock);
2034                 destroy(&sock);
2035                 return;
2036         }
2037
2038         /*
2039          * Try to do as much I/O as possible on this socket.  There are no
2040          * limits here, currently.
2041          */
2042         dev = ISC_LIST_HEAD(sock->recv_list);
2043         while (dev != NULL) {
2044                 switch (doio_recv(sock, dev)) {
2045                 case DOIO_SOFT:
2046                         goto poke;
2047
2048                 case DOIO_EOF:
2049                         /*
2050                          * read of 0 means the remote end was closed.
2051                          * Run through the event queue and dispatch all
2052                          * the events with an EOF result code.
2053                          */
2054                         do {
2055                                 dev->result = ISC_R_EOF;
2056                                 send_recvdone_event(sock, &dev);
2057                                 dev = ISC_LIST_HEAD(sock->recv_list);
2058                         } while (dev != NULL);
2059                         goto poke;
2060
2061                 case DOIO_SUCCESS:
2062                 case DOIO_HARD:
2063                         send_recvdone_event(sock, &dev);
2064                         break;
2065                 }
2066
2067                 dev = ISC_LIST_HEAD(sock->recv_list);
2068         }
2069
2070  poke:
2071         if (!ISC_LIST_EMPTY(sock->recv_list))
2072                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2073
2074         UNLOCK(&sock->lock);
2075 }
2076
2077 static void
2078 internal_send(isc_task_t *me, isc_event_t *ev) {
2079         isc_socketevent_t *dev;
2080         isc_socket_t *sock;
2081
2082         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2083
2084         /*
2085          * Find out what socket this is and lock it.
2086          */
2087         sock = (isc_socket_t *)ev->ev_sender;
2088         INSIST(VALID_SOCKET(sock));
2089
2090         LOCK(&sock->lock);
2091         socket_log(sock, NULL, IOEVENT,
2092                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2093                    "internal_send: task %p got event %p", me, ev);
2094
2095         INSIST(sock->pending_send == 1);
2096         sock->pending_send = 0;
2097
2098         INSIST(sock->references > 0);
2099         sock->references--;  /* the internal event is done with this socket */
2100         if (sock->references == 0) {
2101                 UNLOCK(&sock->lock);
2102                 destroy(&sock);
2103                 return;
2104         }
2105
2106         /*
2107          * Try to do as much I/O as possible on this socket.  There are no
2108          * limits here, currently.
2109          */
2110         dev = ISC_LIST_HEAD(sock->send_list);
2111         while (dev != NULL) {
2112                 switch (doio_send(sock, dev)) {
2113                 case DOIO_SOFT:
2114                         goto poke;
2115
2116                 case DOIO_HARD:
2117                 case DOIO_SUCCESS:
2118                         send_senddone_event(sock, &dev);
2119                         break;
2120                 }
2121
2122                 dev = ISC_LIST_HEAD(sock->send_list);
2123         }
2124
2125  poke:
2126         if (!ISC_LIST_EMPTY(sock->send_list))
2127                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2128
2129         UNLOCK(&sock->lock);
2130 }
2131
2132 static void
2133 process_fds(isc_socketmgr_t *manager, int maxfd,
2134             fd_set *readfds, fd_set *writefds)
2135 {
2136         int i;
2137         isc_socket_t *sock;
2138         isc_boolean_t unlock_sock;
2139
2140         REQUIRE(maxfd <= (int)FD_SETSIZE);
2141
2142         /*
2143          * Process read/writes on other fds here.  Avoid locking
2144          * and unlocking twice if both reads and writes are possible.
2145          */
2146         for (i = 0; i < maxfd; i++) {
2147 #ifdef ISC_PLATFORM_USETHREADS
2148                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
2149                         continue;
2150 #endif /* ISC_PLATFORM_USETHREADS */
2151
2152                 if (manager->fdstate[i] == CLOSE_PENDING) {
2153                         manager->fdstate[i] = CLOSED;
2154                         FD_CLR(i, &manager->read_fds);
2155                         FD_CLR(i, &manager->write_fds);
2156
2157                         (void)close(i);
2158
2159                         continue;
2160                 }
2161
2162                 sock = manager->fds[i];
2163                 unlock_sock = ISC_FALSE;
2164                 if (FD_ISSET(i, readfds)) {
2165                         if (sock == NULL) {
2166                                 FD_CLR(i, &manager->read_fds);
2167                                 goto check_write;
2168                         }
2169                         unlock_sock = ISC_TRUE;
2170                         LOCK(&sock->lock);
2171                         if (!SOCK_DEAD(sock)) {
2172                                 if (sock->listener)
2173                                         dispatch_accept(sock);
2174                                 else
2175                                         dispatch_recv(sock);
2176                         }
2177                         FD_CLR(i, &manager->read_fds);
2178                 }
2179         check_write:
2180                 if (FD_ISSET(i, writefds)) {
2181                         if (sock == NULL) {
2182                                 FD_CLR(i, &manager->write_fds);
2183                                 continue;
2184                         }
2185                         if (!unlock_sock) {
2186                                 unlock_sock = ISC_TRUE;
2187                                 LOCK(&sock->lock);
2188                         }
2189                         if (!SOCK_DEAD(sock)) {
2190                                 if (sock->connecting)
2191                                         dispatch_connect(sock);
2192                                 else
2193                                         dispatch_send(sock);
2194                         }
2195                         FD_CLR(i, &manager->write_fds);
2196                 }
2197                 if (unlock_sock)
2198                         UNLOCK(&sock->lock);
2199         }
2200 }
2201
2202 #ifdef ISC_PLATFORM_USETHREADS
2203 /*
2204  * This is the thread that will loop forever, always in a select or poll
2205  * call.
2206  *
2207  * When select returns something to do, track down what thread gets to do
2208  * this I/O and post the event to it.
2209  */
2210 static isc_threadresult_t
2211 watcher(void *uap) {
2212         isc_socketmgr_t *manager = uap;
2213         isc_boolean_t done;
2214         int ctlfd;
2215         int cc;
2216         fd_set readfds;
2217         fd_set writefds;
2218         int msg, fd;
2219         int maxfd;
2220         char strbuf[ISC_STRERRORSIZE];
2221
2222         /*
2223          * Get the control fd here.  This will never change.
2224          */
2225         LOCK(&manager->lock);
2226         ctlfd = manager->pipe_fds[0];
2227
2228         done = ISC_FALSE;
2229         while (!done) {
2230                 do {
2231                         readfds = manager->read_fds;
2232                         writefds = manager->write_fds;
2233                         maxfd = manager->maxfd + 1;
2234
2235                         UNLOCK(&manager->lock);
2236
2237                         cc = select(maxfd, &readfds, &writefds, NULL, NULL);
2238                         if (cc < 0) {
2239                                 if (!SOFT_ERROR(errno)) {
2240                                         isc__strerror(errno, strbuf,
2241                                                       sizeof(strbuf));
2242                                         FATAL_ERROR(__FILE__, __LINE__,
2243                                                     "select() %s: %s",
2244                                                     isc_msgcat_get(isc_msgcat,
2245                                                             ISC_MSGSET_GENERAL,
2246                                                             ISC_MSG_FAILED,
2247                                                             "failed"),
2248                                                     strbuf);
2249                                 }
2250                         }
2251
2252                         LOCK(&manager->lock);
2253                 } while (cc < 0);
2254
2255
2256                 /*
2257                  * Process reads on internal, control fd.
2258                  */
2259                 if (FD_ISSET(ctlfd, &readfds)) {
2260                         for (;;) {
2261                                 select_readmsg(manager, &fd, &msg);
2262
2263                                 manager_log(manager, IOEVENT,
2264                                             isc_msgcat_get(isc_msgcat,
2265                                                      ISC_MSGSET_SOCKET,
2266                                                      ISC_MSG_WATCHERMSG,
2267                                                      "watcher got message %d"),
2268                                                      msg);
2269
2270                                 /*
2271                                  * Nothing to read?
2272                                  */
2273                                 if (msg == SELECT_POKE_NOTHING)
2274                                         break;
2275
2276                                 /*
2277                                  * Handle shutdown message.  We really should
2278                                  * jump out of this loop right away, but
2279                                  * it doesn't matter if we have to do a little
2280                                  * more work first.
2281                                  */
2282                                 if (msg == SELECT_POKE_SHUTDOWN) {
2283                                         done = ISC_TRUE;
2284
2285                                         break;
2286                                 }
2287
2288                                 /*
2289                                  * This is a wakeup on a socket.  Look
2290                                  * at the event queue for both read and write,
2291                                  * and decide if we need to watch on it now
2292                                  * or not.
2293                                  */
2294                                 wakeup_socket(manager, fd, msg);
2295                         }
2296                 }
2297
2298                 process_fds(manager, maxfd, &readfds, &writefds);
2299         }
2300
2301         manager_log(manager, TRACE,
2302                     isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2303                                    ISC_MSG_EXITING, "watcher exiting"));
2304
2305         UNLOCK(&manager->lock);
2306         return ((isc_threadresult_t)0);
2307 }
2308 #endif /* ISC_PLATFORM_USETHREADS */
2309
2310 /*
2311  * Create a new socket manager.
2312  */
2313 isc_result_t
2314 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2315         isc_socketmgr_t *manager;
2316 #ifdef ISC_PLATFORM_USETHREADS
2317         char strbuf[ISC_STRERRORSIZE];
2318 #endif
2319
2320         REQUIRE(managerp != NULL && *managerp == NULL);
2321
2322 #ifndef ISC_PLATFORM_USETHREADS
2323         if (socketmgr != NULL) {
2324                 socketmgr->refs++;
2325                 *managerp = socketmgr;
2326                 return (ISC_R_SUCCESS);
2327         }
2328 #endif /* ISC_PLATFORM_USETHREADS */
2329
2330         manager = isc_mem_get(mctx, sizeof(*manager));
2331         if (manager == NULL)
2332                 return (ISC_R_NOMEMORY);
2333
2334         manager->magic = SOCKET_MANAGER_MAGIC;
2335         manager->mctx = NULL;
2336         memset(manager->fds, 0, sizeof(manager->fds));
2337         ISC_LIST_INIT(manager->socklist);
2338         if (isc_mutex_init(&manager->lock) != ISC_R_SUCCESS) {
2339                 isc_mem_put(mctx, manager, sizeof(*manager));
2340                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2341                                  "isc_mutex_init() %s",
2342                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2343                                                 ISC_MSG_FAILED, "failed"));
2344                 return (ISC_R_UNEXPECTED);
2345         }
2346 #ifdef ISC_PLATFORM_USETHREADS
2347         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2348                 DESTROYLOCK(&manager->lock);
2349                 isc_mem_put(mctx, manager, sizeof(*manager));
2350                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2351                                  "isc_condition_init() %s",
2352                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2353                                                 ISC_MSG_FAILED, "failed"));
2354                 return (ISC_R_UNEXPECTED);
2355         }
2356
2357         /*
2358          * Create the special fds that will be used to wake up the
2359          * select/poll loop when something internal needs to be done.
2360          */
2361         if (pipe(manager->pipe_fds) != 0) {
2362                 DESTROYLOCK(&manager->lock);
2363                 isc_mem_put(mctx, manager, sizeof(*manager));
2364                 isc__strerror(errno, strbuf, sizeof(strbuf));
2365                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2366                                  "pipe() %s: %s",
2367                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2368                                                 ISC_MSG_FAILED, "failed"),
2369                                  strbuf);
2370
2371                 return (ISC_R_UNEXPECTED);
2372         }
2373
2374         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
2375 #if 0
2376         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
2377 #endif
2378 #else /* ISC_PLATFORM_USETHREADS */
2379         manager->refs = 1;
2380 #endif /* ISC_PLATFORM_USETHREADS */
2381
2382         /*
2383          * Set up initial state for the select loop
2384          */
2385         FD_ZERO(&manager->read_fds);
2386         FD_ZERO(&manager->write_fds);
2387 #ifdef ISC_PLATFORM_USETHREADS
2388         FD_SET(manager->pipe_fds[0], &manager->read_fds);
2389         manager->maxfd = manager->pipe_fds[0];
2390 #else /* ISC_PLATFORM_USETHREADS */
2391         manager->maxfd = 0;
2392 #endif /* ISC_PLATFORM_USETHREADS */
2393         memset(manager->fdstate, 0, sizeof(manager->fdstate));
2394
2395 #ifdef ISC_PLATFORM_USETHREADS
2396         /*
2397          * Start up the select/poll thread.
2398          */
2399         if (isc_thread_create(watcher, manager, &manager->watcher) !=
2400             ISC_R_SUCCESS) {
2401                 (void)close(manager->pipe_fds[0]);
2402                 (void)close(manager->pipe_fds[1]);
2403                 DESTROYLOCK(&manager->lock);
2404                 isc_mem_put(mctx, manager, sizeof(*manager));
2405                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2406                                  "isc_thread_create() %s",
2407                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2408                                                 ISC_MSG_FAILED, "failed"));
2409                 return (ISC_R_UNEXPECTED);
2410         }
2411 #endif /* ISC_PLATFORM_USETHREADS */
2412         isc_mem_attach(mctx, &manager->mctx);
2413
2414 #ifndef ISC_PLATFORM_USETHREADS
2415         socketmgr = manager;
2416 #endif /* ISC_PLATFORM_USETHREADS */
2417         *managerp = manager;
2418
2419         return (ISC_R_SUCCESS);
2420 }
2421
2422 void
2423 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
2424         isc_socketmgr_t *manager;
2425         int i;
2426         isc_mem_t *mctx;
2427
2428         /*
2429          * Destroy a socket manager.
2430          */
2431
2432         REQUIRE(managerp != NULL);
2433         manager = *managerp;
2434         REQUIRE(VALID_MANAGER(manager));
2435
2436 #ifndef ISC_PLATFORM_USETHREADS
2437         if (manager->refs > 1) {
2438                 manager->refs--;
2439                 *managerp = NULL;
2440                 return;
2441         }
2442 #endif /* ISC_PLATFORM_USETHREADS */
2443
2444         LOCK(&manager->lock);
2445
2446 #ifdef ISC_PLATFORM_USETHREADS
2447         /*
2448          * Wait for all sockets to be destroyed.
2449          */
2450         while (!ISC_LIST_EMPTY(manager->socklist)) {
2451                 manager_log(manager, CREATION,
2452                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2453                                            ISC_MSG_SOCKETSREMAIN,
2454                                            "sockets exist"));
2455                 WAIT(&manager->shutdown_ok, &manager->lock);
2456         }
2457 #else /* ISC_PLATFORM_USETHREADS */
2458         /*
2459          * Hope all sockets have been destroyed.
2460          */
2461         if (!ISC_LIST_EMPTY(manager->socklist)) {
2462                 manager_log(manager, CREATION,
2463                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2464                                            ISC_MSG_SOCKETSREMAIN,
2465                                            "sockets exist"));
2466                 INSIST(0);
2467         }
2468 #endif /* ISC_PLATFORM_USETHREADS */
2469
2470         UNLOCK(&manager->lock);
2471
2472         /*
2473          * Here, poke our select/poll thread.  Do this by closing the write
2474          * half of the pipe, which will send EOF to the read half.
2475          * This is currently a no-op in the non-threaded case.
2476          */
2477         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2478
2479 #ifdef ISC_PLATFORM_USETHREADS
2480         /*
2481          * Wait for thread to exit.
2482          */
2483         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
2484                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2485                                  "isc_thread_join() %s",
2486                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2487                                                 ISC_MSG_FAILED, "failed"));
2488 #endif /* ISC_PLATFORM_USETHREADS */
2489
2490         /*
2491          * Clean up.
2492          */
2493 #ifdef ISC_PLATFORM_USETHREADS
2494         (void)close(manager->pipe_fds[0]);
2495         (void)close(manager->pipe_fds[1]);
2496         (void)isc_condition_destroy(&manager->shutdown_ok);
2497 #endif /* ISC_PLATFORM_USETHREADS */
2498
2499         for (i = 0; i < (int)FD_SETSIZE; i++)
2500                 if (manager->fdstate[i] == CLOSE_PENDING)
2501                         (void)close(i);
2502
2503         DESTROYLOCK(&manager->lock);
2504         manager->magic = 0;
2505         mctx= manager->mctx;
2506         isc_mem_put(mctx, manager, sizeof(*manager));
2507
2508         isc_mem_detach(&mctx);
2509
2510         *managerp = NULL;
2511 }
2512
2513 static isc_result_t
2514 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2515             unsigned int flags)
2516 {
2517         int io_state;
2518         isc_boolean_t have_lock = ISC_FALSE;
2519         isc_task_t *ntask = NULL;
2520         isc_result_t result = ISC_R_SUCCESS;
2521
2522         dev->ev_sender = task;
2523
2524         if (sock->type == isc_sockettype_udp) {
2525                 io_state = doio_recv(sock, dev);
2526         } else {
2527                 LOCK(&sock->lock);
2528                 have_lock = ISC_TRUE;
2529
2530                 if (ISC_LIST_EMPTY(sock->recv_list))
2531                         io_state = doio_recv(sock, dev);
2532                 else
2533                         io_state = DOIO_SOFT;
2534         }
2535
2536         switch (io_state) {
2537         case DOIO_SOFT:
2538                 /*
2539                  * We couldn't read all or part of the request right now, so
2540                  * queue it.
2541                  *
2542                  * Attach to socket and to task
2543                  */
2544                 isc_task_attach(task, &ntask);
2545                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2546
2547                 if (!have_lock) {
2548                         LOCK(&sock->lock);
2549                         have_lock = ISC_TRUE;
2550                 }
2551
2552                 /*
2553                  * Enqueue the request.  If the socket was previously not being
2554                  * watched, poke the watcher to start paying attention to it.
2555                  */
2556                 if (ISC_LIST_EMPTY(sock->recv_list))
2557                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2558                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2559
2560                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2561                            "socket_recv: event %p -> task %p",
2562                            dev, ntask);
2563
2564                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2565                         result = ISC_R_INPROGRESS;
2566                 break;
2567
2568         case DOIO_EOF:
2569                 dev->result = ISC_R_EOF;
2570                 /* fallthrough */
2571
2572         case DOIO_HARD:
2573         case DOIO_SUCCESS:
2574                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2575                         send_recvdone_event(sock, &dev);
2576                 break;
2577         }
2578
2579         if (have_lock)
2580                 UNLOCK(&sock->lock);
2581
2582         return (result);
2583 }
2584
2585 isc_result_t
2586 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2587                  unsigned int minimum, isc_task_t *task,
2588                  isc_taskaction_t action, const void *arg)
2589 {
2590         isc_socketevent_t *dev;
2591         isc_socketmgr_t *manager;
2592         unsigned int iocount;
2593         isc_buffer_t *buffer;
2594
2595         REQUIRE(VALID_SOCKET(sock));
2596         REQUIRE(buflist != NULL);
2597         REQUIRE(!ISC_LIST_EMPTY(*buflist));
2598         REQUIRE(task != NULL);
2599         REQUIRE(action != NULL);
2600
2601         manager = sock->manager;
2602         REQUIRE(VALID_MANAGER(manager));
2603
2604         iocount = isc_bufferlist_availablecount(buflist);
2605         REQUIRE(iocount > 0);
2606
2607         INSIST(sock->bound);
2608
2609         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2610         if (dev == NULL) {
2611                 return (ISC_R_NOMEMORY);
2612         }
2613
2614         /*
2615          * UDP sockets are always partial read
2616          */
2617         if (sock->type == isc_sockettype_udp)
2618                 dev->minimum = 1;
2619         else {
2620                 if (minimum == 0)
2621                         dev->minimum = iocount;
2622                 else
2623                         dev->minimum = minimum;
2624         }
2625
2626         /*
2627          * Move each buffer from the passed in list to our internal one.
2628          */
2629         buffer = ISC_LIST_HEAD(*buflist);
2630         while (buffer != NULL) {
2631                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2632                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2633                 buffer = ISC_LIST_HEAD(*buflist);
2634         }
2635
2636         return (socket_recv(sock, dev, task, 0));
2637 }
2638
2639 isc_result_t
2640 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
2641                 isc_task_t *task, isc_taskaction_t action, const void *arg)
2642 {
2643         isc_socketevent_t *dev;
2644         isc_socketmgr_t *manager;
2645
2646         REQUIRE(VALID_SOCKET(sock));
2647         REQUIRE(action != NULL);
2648
2649         manager = sock->manager;
2650         REQUIRE(VALID_MANAGER(manager));
2651
2652         INSIST(sock->bound);
2653
2654         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2655         if (dev == NULL)
2656                 return (ISC_R_NOMEMORY);
2657
2658         return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
2659 }
2660
2661 isc_result_t
2662 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
2663                  unsigned int minimum, isc_task_t *task,
2664                  isc_socketevent_t *event, unsigned int flags)
2665 {
2666         event->ev_sender = sock;
2667         event->result = ISC_R_UNEXPECTED;
2668         ISC_LIST_INIT(event->bufferlist);
2669         event->region = *region;
2670         event->n = 0;
2671         event->offset = 0;
2672         event->attributes = 0;
2673
2674         /*
2675          * UDP sockets are always partial read.
2676          */
2677         if (sock->type == isc_sockettype_udp)
2678                 event->minimum = 1;
2679         else {
2680                 if (minimum == 0)
2681                         event->minimum = region->length;
2682                 else
2683                         event->minimum = minimum;
2684         }
2685
2686         return (socket_recv(sock, event, task, flags));
2687 }
2688
2689 static isc_result_t
2690 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2691             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2692             unsigned int flags)
2693 {
2694         int io_state;
2695         isc_boolean_t have_lock = ISC_FALSE;
2696         isc_task_t *ntask = NULL;
2697         isc_result_t result = ISC_R_SUCCESS;
2698
2699         dev->ev_sender = task;
2700
2701         set_dev_address(address, sock, dev);
2702         if (pktinfo != NULL) {
2703                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2704                 dev->pktinfo = *pktinfo;
2705
2706                 if (!isc_sockaddr_issitelocal(&dev->address) &&
2707                     !isc_sockaddr_islinklocal(&dev->address)) {
2708                         socket_log(sock, NULL, TRACE, isc_msgcat,
2709                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
2710                                    "pktinfo structure provided, ifindex %u "
2711                                    "(set to 0)", pktinfo->ipi6_ifindex);
2712
2713                         /*
2714                          * Set the pktinfo index to 0 here, to let the
2715                          * kernel decide what interface it should send on.
2716                          */
2717                         dev->pktinfo.ipi6_ifindex = 0;
2718                 }
2719         }
2720
2721         if (sock->type == isc_sockettype_udp)
2722                 io_state = doio_send(sock, dev);
2723         else {
2724                 LOCK(&sock->lock);
2725                 have_lock = ISC_TRUE;
2726
2727                 if (ISC_LIST_EMPTY(sock->send_list))
2728                         io_state = doio_send(sock, dev);
2729                 else
2730                         io_state = DOIO_SOFT;
2731         }
2732
2733         switch (io_state) {
2734         case DOIO_SOFT:
2735                 /*
2736                  * We couldn't send all or part of the request right now, so
2737                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
2738                  */
2739                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2740                         isc_task_attach(task, &ntask);
2741                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2742
2743                         if (!have_lock) {
2744                                 LOCK(&sock->lock);
2745                                 have_lock = ISC_TRUE;
2746                         }
2747
2748                         /*
2749                          * Enqueue the request.  If the socket was previously
2750                          * not being watched, poke the watcher to start
2751                          * paying attention to it.
2752                          */
2753                         if (ISC_LIST_EMPTY(sock->send_list))
2754                                 select_poke(sock->manager, sock->fd,
2755                                             SELECT_POKE_WRITE);
2756                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2757
2758                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
2759                                    "socket_send: event %p -> task %p",
2760                                    dev, ntask);
2761
2762                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2763                                 result = ISC_R_INPROGRESS;
2764                         break;
2765                 }
2766
2767         case DOIO_HARD:
2768         case DOIO_SUCCESS:
2769                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2770                         send_senddone_event(sock, &dev);
2771                 break;
2772         }
2773
2774         if (have_lock)
2775                 UNLOCK(&sock->lock);
2776
2777         return (result);
2778 }
2779
2780 isc_result_t
2781 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
2782                 isc_task_t *task, isc_taskaction_t action, const void *arg)
2783 {
2784         /*
2785          * REQUIRE() checking is performed in isc_socket_sendto().
2786          */
2787         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
2788                                   NULL));
2789 }
2790
2791 isc_result_t
2792 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
2793                   isc_task_t *task, isc_taskaction_t action, const void *arg,
2794                   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2795 {
2796         isc_socketevent_t *dev;
2797         isc_socketmgr_t *manager;
2798
2799         REQUIRE(VALID_SOCKET(sock));
2800         REQUIRE(region != NULL);
2801         REQUIRE(task != NULL);
2802         REQUIRE(action != NULL);
2803
2804         manager = sock->manager;
2805         REQUIRE(VALID_MANAGER(manager));
2806
2807         INSIST(sock->bound);
2808
2809         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2810         if (dev == NULL) {
2811                 return (ISC_R_NOMEMORY);
2812         }
2813
2814         dev->region = *region;
2815
2816         return (socket_send(sock, dev, task, address, pktinfo, 0));
2817 }
2818
2819 isc_result_t
2820 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2821                  isc_task_t *task, isc_taskaction_t action, const void *arg)
2822 {
2823         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
2824                                    NULL));
2825 }
2826
2827 isc_result_t
2828 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
2829                    isc_task_t *task, isc_taskaction_t action, const void *arg,
2830                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2831 {
2832         isc_socketevent_t *dev;
2833         isc_socketmgr_t *manager;
2834         unsigned int iocount;
2835         isc_buffer_t *buffer;
2836
2837         REQUIRE(VALID_SOCKET(sock));
2838         REQUIRE(buflist != NULL);
2839         REQUIRE(!ISC_LIST_EMPTY(*buflist));
2840         REQUIRE(task != NULL);
2841         REQUIRE(action != NULL);
2842
2843         manager = sock->manager;
2844         REQUIRE(VALID_MANAGER(manager));
2845
2846         iocount = isc_bufferlist_usedcount(buflist);
2847         REQUIRE(iocount > 0);
2848
2849         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2850         if (dev == NULL) {
2851                 return (ISC_R_NOMEMORY);
2852         }
2853
2854         /*
2855          * Move each buffer from the passed in list to our internal one.
2856          */
2857         buffer = ISC_LIST_HEAD(*buflist);
2858         while (buffer != NULL) {
2859                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2860                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2861                 buffer = ISC_LIST_HEAD(*buflist);
2862         }
2863
2864         return (socket_send(sock, dev, task, address, pktinfo, 0));
2865 }
2866
2867 isc_result_t
2868 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
2869                    isc_task_t *task,
2870                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2871                    isc_socketevent_t *event, unsigned int flags)
2872 {
2873         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
2874         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
2875                 REQUIRE(sock->type == isc_sockettype_udp);
2876         event->ev_sender = sock;
2877         event->result = ISC_R_UNEXPECTED;
2878         ISC_LIST_INIT(event->bufferlist);
2879         event->region = *region;
2880         event->n = 0;
2881         event->offset = 0;
2882         event->attributes = 0;
2883
2884         return (socket_send(sock, event, task, address, pktinfo, flags));
2885 }
2886
2887 isc_result_t
2888 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr) {
2889         char strbuf[ISC_STRERRORSIZE];
2890         int on = 1;
2891
2892         LOCK(&sock->lock);
2893
2894         INSIST(!sock->bound);
2895
2896         if (sock->pf != sockaddr->type.sa.sa_family) {
2897                 UNLOCK(&sock->lock);
2898                 return (ISC_R_FAMILYMISMATCH);
2899         }
2900         /*
2901          * Only set SO_REUSEADDR when we want a specific port.
2902          */
2903         if (isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2904             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2905                        sizeof(on)) < 0) {
2906                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2907                                  "setsockopt(%d) %s", sock->fd,
2908                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2909                                                 ISC_MSG_FAILED, "failed"));
2910                 /* Press on... */
2911         }
2912         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2913                 UNLOCK(&sock->lock);
2914                 switch (errno) {
2915                 case EACCES:
2916                         return (ISC_R_NOPERM);
2917                 case EADDRNOTAVAIL:
2918                         return (ISC_R_ADDRNOTAVAIL);
2919                 case EADDRINUSE:
2920                         return (ISC_R_ADDRINUSE);
2921                 case EINVAL:
2922                         return (ISC_R_BOUND);
2923                 default:
2924                         isc__strerror(errno, strbuf, sizeof(strbuf));
2925                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2926                                          strbuf);
2927                         return (ISC_R_UNEXPECTED);
2928                 }
2929         }
2930
2931         socket_log(sock, sockaddr, TRACE,
2932                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
2933         sock->bound = 1;
2934
2935         UNLOCK(&sock->lock);
2936         return (ISC_R_SUCCESS);
2937 }
2938
2939 isc_result_t
2940 isc_socket_filter(isc_socket_t *sock, const char *filter) {
2941 #ifdef SO_ACCEPTFILTER
2942         char strbuf[ISC_STRERRORSIZE];
2943         struct accept_filter_arg afa;
2944 #else
2945         UNUSED(sock);
2946         UNUSED(filter);
2947 #endif
2948
2949         REQUIRE(VALID_SOCKET(sock));
2950
2951 #ifdef SO_ACCEPTFILTER
2952         bzero(&afa, sizeof(afa));
2953         strncpy(afa.af_name, filter, sizeof(afa.af_name));
2954         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
2955                          &afa, sizeof(afa)) == -1) {
2956                 isc__strerror(errno, strbuf, sizeof(strbuf));
2957                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2958                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
2959                            strbuf);
2960                 return (ISC_R_FAILURE);
2961         }
2962         return (ISC_R_SUCCESS);
2963 #else
2964         return (ISC_R_NOTIMPLEMENTED);
2965 #endif
2966 }
2967
2968 /*
2969  * Set up to listen on a given socket.  We do this by creating an internal
2970  * event that will be dispatched when the socket has read activity.  The
2971  * watcher will send the internal event to the task when there is a new
2972  * connection.
2973  *
2974  * Unlike in read, we don't preallocate a done event here.  Every time there
2975  * is a new connection we'll have to allocate a new one anyway, so we might
2976  * as well keep things simple rather than having to track them.
2977  */
2978 isc_result_t
2979 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
2980         char strbuf[ISC_STRERRORSIZE];
2981
2982         REQUIRE(VALID_SOCKET(sock));
2983
2984         LOCK(&sock->lock);
2985
2986         REQUIRE(!sock->listener);
2987         REQUIRE(sock->bound);
2988         REQUIRE(sock->type == isc_sockettype_tcp);
2989
2990         if (backlog == 0)
2991                 backlog = SOMAXCONN;
2992
2993         if (listen(sock->fd, (int)backlog) < 0) {
2994                 UNLOCK(&sock->lock);
2995                 isc__strerror(errno, strbuf, sizeof(strbuf));
2996
2997                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
2998
2999                 return (ISC_R_UNEXPECTED);
3000         }
3001
3002         sock->listener = 1;
3003
3004         UNLOCK(&sock->lock);
3005         return (ISC_R_SUCCESS);
3006 }
3007
3008 /*
3009  * This should try to do agressive accept() XXXMLG
3010  */
3011 isc_result_t
3012 isc_socket_accept(isc_socket_t *sock,
3013                   isc_task_t *task, isc_taskaction_t action, const void *arg)
3014 {
3015         isc_socket_newconnev_t *dev;
3016         isc_socketmgr_t *manager;
3017         isc_task_t *ntask = NULL;
3018         isc_socket_t *nsock;
3019         isc_result_t ret;
3020         isc_boolean_t do_poke = ISC_FALSE;
3021
3022         REQUIRE(VALID_SOCKET(sock));
3023         manager = sock->manager;
3024         REQUIRE(VALID_MANAGER(manager));
3025
3026         LOCK(&sock->lock);
3027
3028         REQUIRE(sock->listener);
3029
3030         /*
3031          * Sender field is overloaded here with the task we will be sending
3032          * this event to.  Just before the actual event is delivered the
3033          * actual ev_sender will be touched up to be the socket.
3034          */
3035         dev = (isc_socket_newconnev_t *)
3036                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3037                                    action, arg, sizeof(*dev));
3038         if (dev == NULL) {
3039                 UNLOCK(&sock->lock);
3040                 return (ISC_R_NOMEMORY);
3041         }
3042         ISC_LINK_INIT(dev, ev_link);
3043
3044         ret = allocate_socket(manager, sock->type, &nsock);
3045         if (ret != ISC_R_SUCCESS) {
3046                 isc_event_free(ISC_EVENT_PTR(&dev));
3047                 UNLOCK(&sock->lock);
3048                 return (ret);
3049         }
3050
3051         /*
3052          * Attach to socket and to task.
3053          */
3054         isc_task_attach(task, &ntask);
3055         nsock->references++;
3056
3057         dev->ev_sender = ntask;
3058         dev->newsocket = nsock;
3059
3060         /*
3061          * Poke watcher here.  We still have the socket locked, so there
3062          * is no race condition.  We will keep the lock for such a short
3063          * bit of time waking it up now or later won't matter all that much.
3064          */
3065         if (ISC_LIST_EMPTY(sock->accept_list))
3066                 do_poke = ISC_TRUE;
3067
3068         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
3069
3070         if (do_poke)
3071                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
3072
3073         UNLOCK(&sock->lock);
3074         return (ISC_R_SUCCESS);
3075 }
3076
3077 isc_result_t
3078 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3079                    isc_task_t *task, isc_taskaction_t action, const void *arg)
3080 {
3081         isc_socket_connev_t *dev;
3082         isc_task_t *ntask = NULL;
3083         isc_socketmgr_t *manager;
3084         int cc;
3085         char strbuf[ISC_STRERRORSIZE];
3086
3087         REQUIRE(VALID_SOCKET(sock));
3088         REQUIRE(addr != NULL);
3089         REQUIRE(task != NULL);
3090         REQUIRE(action != NULL);
3091
3092         manager = sock->manager;
3093         REQUIRE(VALID_MANAGER(manager));
3094         REQUIRE(addr != NULL);
3095
3096         if (isc_sockaddr_ismulticast(addr))
3097                 return (ISC_R_MULTICAST);
3098
3099         LOCK(&sock->lock);
3100
3101         REQUIRE(!sock->connecting);
3102
3103         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3104                                                         ISC_SOCKEVENT_CONNECT,
3105                                                         action, arg,
3106                                                         sizeof(*dev));
3107         if (dev == NULL) {
3108                 UNLOCK(&sock->lock);
3109                 return (ISC_R_NOMEMORY);
3110         }
3111         ISC_LINK_INIT(dev, ev_link);
3112
3113         /*
3114          * Try to do the connect right away, as there can be only one
3115          * outstanding, and it might happen to complete.
3116          */
3117         sock->address = *addr;
3118         cc = connect(sock->fd, &addr->type.sa, addr->length);
3119         if (cc < 0) {
3120                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
3121                         goto queue;
3122
3123                 switch (errno) {
3124 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
3125                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
3126                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3127                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3128                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3129                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3130 #ifdef EHOSTDOWN
3131                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3132 #endif
3133                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3134                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3135                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3136                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3137                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3138 #undef ERROR_MATCH
3139                 }
3140
3141                 sock->connected = 0;
3142
3143                 isc__strerror(errno, strbuf, sizeof(strbuf));
3144                 UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
3145
3146                 UNLOCK(&sock->lock);
3147                 isc_event_free(ISC_EVENT_PTR(&dev));
3148                 return (ISC_R_UNEXPECTED);
3149
3150         err_exit:
3151                 sock->connected = 0;
3152                 isc_task_send(task, ISC_EVENT_PTR(&dev));
3153
3154                 UNLOCK(&sock->lock);
3155                 return (ISC_R_SUCCESS);
3156         }
3157
3158         /*
3159          * If connect completed, fire off the done event.
3160          */
3161         if (cc == 0) {
3162                 sock->connected = 1;
3163                 sock->bound = 1;
3164                 dev->result = ISC_R_SUCCESS;
3165                 isc_task_send(task, ISC_EVENT_PTR(&dev));
3166
3167                 UNLOCK(&sock->lock);
3168                 return (ISC_R_SUCCESS);
3169         }
3170
3171  queue:
3172
3173         /*
3174          * Attach to task.
3175          */
3176         isc_task_attach(task, &ntask);
3177
3178         sock->connecting = 1;
3179
3180         dev->ev_sender = ntask;
3181
3182         /*
3183          * Poke watcher here.  We still have the socket locked, so there
3184          * is no race condition.  We will keep the lock for such a short
3185          * bit of time waking it up now or later won't matter all that much.
3186          */
3187         if (sock->connect_ev == NULL)
3188                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
3189
3190         sock->connect_ev = dev;
3191
3192         UNLOCK(&sock->lock);
3193         return (ISC_R_SUCCESS);
3194 }
3195
3196 /*
3197  * Called when a socket with a pending connect() finishes.
3198  */
3199 static void
3200 internal_connect(isc_task_t *me, isc_event_t *ev) {
3201         isc_socket_t *sock;
3202         isc_socket_connev_t *dev;
3203         isc_task_t *task;
3204         int cc;
3205         ISC_SOCKADDR_LEN_T optlen;
3206         char strbuf[ISC_STRERRORSIZE];
3207         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3208
3209         UNUSED(me);
3210         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3211
3212         sock = ev->ev_sender;
3213         INSIST(VALID_SOCKET(sock));
3214
3215         LOCK(&sock->lock);
3216
3217         /*
3218          * When the internal event was sent the reference count was bumped
3219          * to keep the socket around for us.  Decrement the count here.
3220          */
3221         INSIST(sock->references > 0);
3222         sock->references--;
3223         if (sock->references == 0) {
3224                 UNLOCK(&sock->lock);
3225                 destroy(&sock);
3226                 return;
3227         }
3228
3229         /*
3230          * Has this event been canceled?
3231          */
3232         dev = sock->connect_ev;
3233         if (dev == NULL) {
3234                 INSIST(!sock->connecting);
3235                 UNLOCK(&sock->lock);
3236                 return;
3237         }
3238
3239         INSIST(sock->connecting);
3240         sock->connecting = 0;
3241
3242         /*
3243          * Get any possible error status here.
3244          */
3245         optlen = sizeof(cc);
3246         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
3247                        (void *)&cc, (void *)&optlen) < 0)
3248                 cc = errno;
3249         else
3250                 errno = cc;
3251
3252         if (errno != 0) {
3253                 /*
3254                  * If the error is EAGAIN, just re-select on this
3255                  * fd and pretend nothing strange happened.
3256                  */
3257                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
3258                         sock->connecting = 1;
3259                         select_poke(sock->manager, sock->fd,
3260                                     SELECT_POKE_CONNECT);
3261                         UNLOCK(&sock->lock);
3262
3263                         return;
3264                 }
3265
3266                 /*
3267                  * Translate other errors into ISC_R_* flavors.
3268                  */
3269                 switch (errno) {
3270 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
3271                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
3272                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3273                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3274                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3275                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3276 #ifdef EHOSTDOWN
3277                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3278 #endif
3279                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3280                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3281                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3282                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3283                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
3284                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3285 #undef ERROR_MATCH
3286                 default:
3287                         dev->result = ISC_R_UNEXPECTED;
3288                         isc_sockaddr_format(&sock->address, peerbuf,
3289                                             sizeof(peerbuf));
3290                         isc__strerror(errno, strbuf, sizeof(strbuf));
3291                         UNEXPECTED_ERROR(__FILE__, __LINE__,
3292                                          "internal_connect: connect(%s) %s",
3293                                          peerbuf, strbuf);
3294                 }
3295         } else {
3296                 dev->result = ISC_R_SUCCESS;
3297                 sock->connected = 1;
3298                 sock->bound = 1;
3299         }
3300
3301         sock->connect_ev = NULL;
3302
3303         UNLOCK(&sock->lock);
3304
3305         task = dev->ev_sender;
3306         dev->ev_sender = sock;
3307         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3308 }
3309
3310 isc_result_t
3311 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3312         isc_result_t ret;
3313
3314         REQUIRE(VALID_SOCKET(sock));
3315         REQUIRE(addressp != NULL);
3316
3317         LOCK(&sock->lock);
3318
3319         if (sock->connected) {
3320                 *addressp = sock->address;
3321                 ret = ISC_R_SUCCESS;
3322         } else {
3323                 ret = ISC_R_NOTCONNECTED;
3324         }
3325
3326         UNLOCK(&sock->lock);
3327
3328         return (ret);
3329 }
3330
3331 isc_result_t
3332 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3333         ISC_SOCKADDR_LEN_T len;
3334         isc_result_t ret;
3335         char strbuf[ISC_STRERRORSIZE];
3336
3337         REQUIRE(VALID_SOCKET(sock));
3338         REQUIRE(addressp != NULL);
3339
3340         LOCK(&sock->lock);
3341
3342         if (!sock->bound) {
3343                 ret = ISC_R_NOTBOUND;
3344                 goto out;
3345         }
3346
3347         ret = ISC_R_SUCCESS;
3348
3349         len = sizeof(addressp->type);
3350         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3351                 isc__strerror(errno, strbuf, sizeof(strbuf));
3352                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3353                                  strbuf);
3354                 ret = ISC_R_UNEXPECTED;
3355                 goto out;
3356         }
3357         addressp->length = (unsigned int)len;
3358
3359  out:
3360         UNLOCK(&sock->lock);
3361
3362         return (ret);
3363 }
3364
3365 /*
3366  * Run through the list of events on this socket, and cancel the ones
3367  * queued for task "task" of type "how".  "how" is a bitmask.
3368  */
3369 void
3370 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3371
3372         REQUIRE(VALID_SOCKET(sock));
3373
3374         /*
3375          * Quick exit if there is nothing to do.  Don't even bother locking
3376          * in this case.
3377          */
3378         if (how == 0)
3379                 return;
3380
3381         LOCK(&sock->lock);
3382
3383         /*
3384          * All of these do the same thing, more or less.
3385          * Each will:
3386          *      o If the internal event is marked as "posted" try to
3387          *        remove it from the task's queue.  If this fails, mark it
3388          *        as canceled instead, and let the task clean it up later.
3389          *      o For each I/O request for that task of that type, post
3390          *        its done event with status of "ISC_R_CANCELED".
3391          *      o Reset any state needed.
3392          */
3393         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
3394             && !ISC_LIST_EMPTY(sock->recv_list)) {
3395                 isc_socketevent_t      *dev;
3396                 isc_socketevent_t      *next;
3397                 isc_task_t             *current_task;
3398
3399                 dev = ISC_LIST_HEAD(sock->recv_list);
3400
3401                 while (dev != NULL) {
3402                         current_task = dev->ev_sender;
3403                         next = ISC_LIST_NEXT(dev, ev_link);
3404
3405                         if ((task == NULL) || (task == current_task)) {
3406                                 dev->result = ISC_R_CANCELED;
3407                                 send_recvdone_event(sock, &dev);
3408                         }
3409                         dev = next;
3410                 }
3411         }
3412
3413         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
3414             && !ISC_LIST_EMPTY(sock->send_list)) {
3415                 isc_socketevent_t      *dev;
3416                 isc_socketevent_t      *next;
3417                 isc_task_t             *current_task;
3418
3419                 dev = ISC_LIST_HEAD(sock->send_list);
3420
3421                 while (dev != NULL) {
3422                         current_task = dev->ev_sender;
3423                         next = ISC_LIST_NEXT(dev, ev_link);
3424
3425                         if ((task == NULL) || (task == current_task)) {
3426                                 dev->result = ISC_R_CANCELED;
3427                                 send_senddone_event(sock, &dev);
3428                         }
3429                         dev = next;
3430                 }
3431         }
3432
3433         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3434             && !ISC_LIST_EMPTY(sock->accept_list)) {
3435                 isc_socket_newconnev_t *dev;
3436                 isc_socket_newconnev_t *next;
3437                 isc_task_t             *current_task;
3438
3439                 dev = ISC_LIST_HEAD(sock->accept_list);
3440                 while (dev != NULL) {
3441                         current_task = dev->ev_sender;
3442                         next = ISC_LIST_NEXT(dev, ev_link);
3443
3444                         if ((task == NULL) || (task == current_task)) {
3445
3446                                 ISC_LIST_UNLINK(sock->accept_list, dev,
3447                                                 ev_link);
3448
3449                                 dev->newsocket->references--;
3450                                 free_socket(&dev->newsocket);
3451
3452                                 dev->result = ISC_R_CANCELED;
3453                                 dev->ev_sender = sock;
3454                                 isc_task_sendanddetach(&current_task,
3455                                                        ISC_EVENT_PTR(&dev));
3456                         }
3457
3458                         dev = next;
3459                 }
3460         }
3461
3462         /*
3463          * Connecting is not a list.
3464          */
3465         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3466             && sock->connect_ev != NULL) {
3467                 isc_socket_connev_t    *dev;
3468                 isc_task_t             *current_task;
3469
3470                 INSIST(sock->connecting);
3471                 sock->connecting = 0;
3472
3473                 dev = sock->connect_ev;
3474                 current_task = dev->ev_sender;
3475
3476                 if ((task == NULL) || (task == current_task)) {
3477                         sock->connect_ev = NULL;
3478
3479                         dev->result = ISC_R_CANCELED;
3480                         dev->ev_sender = sock;
3481                         isc_task_sendanddetach(&current_task,
3482                                                ISC_EVENT_PTR(&dev));
3483                 }
3484         }
3485
3486         UNLOCK(&sock->lock);
3487 }
3488
3489 isc_sockettype_t
3490 isc_socket_gettype(isc_socket_t *sock) {
3491         REQUIRE(VALID_SOCKET(sock));
3492
3493         return (sock->type);
3494 }
3495
3496 isc_boolean_t
3497 isc_socket_isbound(isc_socket_t *sock) {
3498         isc_boolean_t val;
3499
3500         LOCK(&sock->lock);
3501         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3502         UNLOCK(&sock->lock);
3503
3504         return (val);
3505 }
3506
3507 void
3508 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3509 #if defined(IPV6_V6ONLY)
3510         int onoff = yes ? 1 : 0;
3511 #else
3512         UNUSED(yes);
3513         UNUSED(sock);
3514 #endif
3515
3516         REQUIRE(VALID_SOCKET(sock));
3517
3518 #ifdef IPV6_V6ONLY
3519         if (sock->pf == AF_INET6) {
3520                 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3521                                  (void *)&onoff, sizeof(onoff));
3522         }
3523 #endif
3524 }
3525
3526 #ifndef ISC_PLATFORM_USETHREADS
3527 void
3528 isc__socketmgr_getfdsets(fd_set *readset, fd_set *writeset, int *maxfd) {
3529         if (socketmgr == NULL)
3530                 *maxfd = 0;
3531         else {
3532                 *readset = socketmgr->read_fds;
3533                 *writeset = socketmgr->write_fds;
3534                 *maxfd = socketmgr->maxfd + 1;
3535         }
3536 }
3537
3538 isc_result_t
3539 isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) {
3540         isc_socketmgr_t *manager = socketmgr;
3541
3542         if (manager == NULL)
3543                 return (ISC_R_NOTFOUND);
3544
3545         process_fds(manager, maxfd, readset, writeset);
3546         return (ISC_R_SUCCESS);
3547 }
3548 #endif /* ISC_PLATFORM_USETHREADS */