Merge from vendor branch GCC:
[dragonfly.git] / contrib / bind-9.3 / lib / isc / unix / socket.c
1 /*
2  * Copyright (C) 2004, 2005  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1998-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: socket.c,v 1.207.2.19.2.22 2005/11/03 23:08:42 marka Exp $ */
19
20 #include <config.h>
21
22 #include <sys/param.h>
23 #include <sys/types.h>
24 #include <sys/socket.h>
25 #include <sys/time.h>
26 #include <sys/uio.h>
27
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <stddef.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <unistd.h>
34
35 #include <isc/buffer.h>
36 #include <isc/bufferlist.h>
37 #include <isc/condition.h>
38 #include <isc/formatcheck.h>
39 #include <isc/list.h>
40 #include <isc/log.h>
41 #include <isc/mem.h>
42 #include <isc/msgs.h>
43 #include <isc/mutex.h>
44 #include <isc/net.h>
45 #include <isc/platform.h>
46 #include <isc/print.h>
47 #include <isc/region.h>
48 #include <isc/socket.h>
49 #include <isc/strerror.h>
50 #include <isc/task.h>
51 #include <isc/thread.h>
52 #include <isc/util.h>
53
54 #include "errno2result.h"
55
56 #ifndef ISC_PLATFORM_USETHREADS
57 #include "socket_p.h"
58 #endif /* ISC_PLATFORM_USETHREADS */
59
/*
 * Type used for socket-length arguments.  Platforms disagree on whether
 * this is an int, a size_t or a socklen_t, so the build may predefine
 * ISC_SOCKADDR_LEN_T; when it does not, unsigned int is used.
 */
#if !defined(ISC_SOCKADDR_LEN_T)
#define ISC_SOCKADDR_LEN_T unsigned int
#endif

/*
 * "Soft" errors: non-fatal errno values that network calls such as
 * recv() may report.
 *
 * errno == 0 is deliberately treated as soft: BSDI (and perhaps other
 * systems) will sometimes return <0 from recv() while leaving errno at
 * zero.  That is broken, but it has to be worked around here.
 */
#define SOFT_ERROR(e) \
        ((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == EINTR || (e) == 0)
80
/*
 * Debug levels used by this module:
 *
 *      90  --  Function entry/exit and other tracing.
 *      70  --  Socket "correctness" -- including returning of events, etc.
 *      60  --  Socket data send/receive.
 *      50  --  Event tracing, including receiving/sending completion events.
 *      20  --  Socket creation/destruction.
 */
#define TRACE_LEVEL             90
#define CORRECTNESS_LEVEL       70
#define IOEVENT_LEVEL           60
#define EVENT_LEVEL             50
#define CREATION_LEVEL          20

/*
 * Expands to the (category, module, level) argument triple taken by the
 * logging helpers in this file, at debug level x.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/* Ready-made argument triples, one per debug level above. */
#define TRACE           DLVL(TRACE_LEVEL)
#define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
#define IOEVENT         DLVL(IOEVENT_LEVEL)
#define EVENT           DLVL(EVENT_LEVEL)
#define CREATION        DLVL(CREATION_LEVEL)
101
102 typedef isc_event_t intev_t;
103
104 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
105 #define VALID_SOCKET(t)         ISC_MAGIC_VALID(t, SOCKET_MAGIC)
106
107 /*
108  * IPv6 control information.  If the socket is an IPv6 socket we want
109  * to collect the destination address and interface so the client can
110  * set them on outgoing packets.
111  */
112 #ifdef ISC_PLATFORM_HAVEIPV6
113 #ifndef USE_CMSG
114 #define USE_CMSG        1
115 #endif
116 #endif
117
118 /*
119  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
120  * a setsockopt() like interface to request timestamps, and if the OS
121  * doesn't do it for us, call gettimeofday() on every UDP receive?
122  */
123 #ifdef SO_TIMESTAMP
124 #ifndef USE_CMSG
125 #define USE_CMSG        1
126 #endif
127 #endif
128
129 /*
130  * The number of times a send operation is repeated if the result is EINTR.
131  */
132 #define NRETRIES 10
133
134 struct isc_socket {
135         /* Not locked. */
136         unsigned int            magic;
137         isc_socketmgr_t        *manager;
138         isc_mutex_t             lock;
139         isc_sockettype_t        type;
140
141         /* Locked by socket lock. */
142         ISC_LINK(isc_socket_t)  link;
143         unsigned int            references;
144         int                     fd;
145         int                     pf;
146
147         ISC_LIST(isc_socketevent_t)             send_list;
148         ISC_LIST(isc_socketevent_t)             recv_list;
149         ISC_LIST(isc_socket_newconnev_t)        accept_list;
150         isc_socket_connev_t                    *connect_ev;
151
152         /*
153          * Internal events.  Posted when a descriptor is readable or
154          * writable.  These are statically allocated and never freed.
155          * They will be set to non-purgable before use.
156          */
157         intev_t                 readable_ev;
158         intev_t                 writable_ev;
159
160         isc_sockaddr_t          address;  /* remote address */
161
162         unsigned int            pending_recv : 1,
163                                 pending_send : 1,
164                                 pending_accept : 1,
165                                 listener : 1, /* listener socket */
166                                 connected : 1,
167                                 connecting : 1, /* connect pending */
168                                 bound : 1; /* bound to local addr */
169
170 #ifdef ISC_NET_RECVOVERFLOW
171         unsigned char           overflow; /* used for MSG_TRUNC fake */
172 #endif
173
174         char                    *recvcmsgbuf;
175         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
176         char                    *sendcmsgbuf;
177         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
178 };
179
180 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
181 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
182
183 struct isc_socketmgr {
184         /* Not locked. */
185         unsigned int            magic;
186         isc_mem_t              *mctx;
187         isc_mutex_t             lock;
188         /* Locked by manager lock. */
189         ISC_LIST(isc_socket_t)  socklist;
190         fd_set                  read_fds;
191         fd_set                  write_fds;
192         isc_socket_t           *fds[FD_SETSIZE];
193         int                     fdstate[FD_SETSIZE];
194         int                     maxfd;
195 #ifdef ISC_PLATFORM_USETHREADS
196         isc_thread_t            watcher;
197         isc_condition_t         shutdown_ok;
198         int                     pipe_fds[2];
199 #else /* ISC_PLATFORM_USETHREADS */
200         unsigned int            refs;
201 #endif /* ISC_PLATFORM_USETHREADS */
202 };
203
204 #ifndef ISC_PLATFORM_USETHREADS
205 static isc_socketmgr_t *socketmgr = NULL;
206 #endif /* ISC_PLATFORM_USETHREADS */
207
208 #define CLOSED          0       /* this one must be zero */
209 #define MANAGED         1
210 #define CLOSE_PENDING   2
211
212 /*
213  * send() and recv() iovec counts
214  */
215 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
216 #ifdef ISC_NET_RECVOVERFLOW
217 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
218 #else
219 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
220 #endif
221
222 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
223 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
224 static void free_socket(isc_socket_t **);
225 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
226                                     isc_socket_t **);
227 static void destroy(isc_socket_t **);
228 static void internal_accept(isc_task_t *, isc_event_t *);
229 static void internal_connect(isc_task_t *, isc_event_t *);
230 static void internal_recv(isc_task_t *, isc_event_t *);
231 static void internal_send(isc_task_t *, isc_event_t *);
232 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
233 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
234                               struct msghdr *, struct iovec *, size_t *);
235 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
236                               struct msghdr *, struct iovec *, size_t *);
237
238 #define SELECT_POKE_SHUTDOWN            (-1)
239 #define SELECT_POKE_NOTHING             (-2)
240 #define SELECT_POKE_READ                (-3)
241 #define SELECT_POKE_ACCEPT              (-3) /* Same as _READ */
242 #define SELECT_POKE_WRITE               (-4)
243 #define SELECT_POKE_CONNECT             (-4) /* Same as _WRITE */
244 #define SELECT_POKE_CLOSE               (-5)
245
246 #define SOCK_DEAD(s)                    ((s)->references == 0)
247
248 static void
249 manager_log(isc_socketmgr_t *sockmgr,
250             isc_logcategory_t *category, isc_logmodule_t *module, int level,
251             const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
252 static void
253 manager_log(isc_socketmgr_t *sockmgr,
254             isc_logcategory_t *category, isc_logmodule_t *module, int level,
255             const char *fmt, ...)
256 {
257         char msgbuf[2048];
258         va_list ap;
259
260         if (! isc_log_wouldlog(isc_lctx, level))
261                 return;
262
263         va_start(ap, fmt);
264         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
265         va_end(ap);
266
267         isc_log_write(isc_lctx, category, module, level,
268                       "sockmgr %p: %s", sockmgr, msgbuf);
269 }
270
271 static void
272 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
273            isc_logcategory_t *category, isc_logmodule_t *module, int level,
274            isc_msgcat_t *msgcat, int msgset, int message,
275            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
276 static void
277 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
278            isc_logcategory_t *category, isc_logmodule_t *module, int level,
279            isc_msgcat_t *msgcat, int msgset, int message,
280            const char *fmt, ...)
281 {
282         char msgbuf[2048];
283         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
284         va_list ap;
285
286         if (! isc_log_wouldlog(isc_lctx, level))
287                 return;
288
289         va_start(ap, fmt);
290         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
291         va_end(ap);
292
293         if (address == NULL) {
294                 isc_log_iwrite(isc_lctx, category, module, level,
295                                msgcat, msgset, message,
296                                "socket %p: %s", sock, msgbuf);
297         } else {
298                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
299                 isc_log_iwrite(isc_lctx, category, module, level,
300                                msgcat, msgset, message,
301                                "socket %p %s: %s", sock, peerbuf, msgbuf);
302         }
303 }
304
305 static void
306 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
307         isc_socket_t *sock;
308
309         /*
310          * This is a wakeup on a socket.  If the socket is not in the
311          * process of being closed, start watching it for either reads
312          * or writes.
313          */
314
315         INSIST(fd >= 0 && fd < (int)FD_SETSIZE);
316
317         if (manager->fdstate[fd] == CLOSE_PENDING) {
318                 manager->fdstate[fd] = CLOSED;
319                 FD_CLR(fd, &manager->read_fds);
320                 FD_CLR(fd, &manager->write_fds);
321                 (void)close(fd);
322                 return;
323         }
324         if (manager->fdstate[fd] != MANAGED)
325                 return;
326
327         sock = manager->fds[fd];
328
329         /*
330          * Set requested bit.
331          */
332         if (msg == SELECT_POKE_READ)
333                 FD_SET(sock->fd, &manager->read_fds);
334         if (msg == SELECT_POKE_WRITE)
335                 FD_SET(sock->fd, &manager->write_fds);
336 }
337
338 #ifdef ISC_PLATFORM_USETHREADS
339 /*
340  * Poke the select loop when there is something for us to do.
341  * The write is required (by POSIX) to complete.  That is, we
342  * will not get partial writes.
343  */
344 static void
345 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
346         int cc;
347         int buf[2];
348         char strbuf[ISC_STRERRORSIZE];
349
350         buf[0] = fd;
351         buf[1] = msg;
352
353         do {
354                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
355 #ifdef ENOSR
356                 /*
357                  * Treat ENOSR as EAGAIN but loop slowly as it is
358                  * unlikely to clear fast.
359                  */
360                 if (cc < 0 && errno == ENOSR) {
361                         sleep(1);
362                         errno = EAGAIN;
363                 }
364 #endif
365         } while (cc < 0 && SOFT_ERROR(errno));
366
367         if (cc < 0) {
368                 isc__strerror(errno, strbuf, sizeof(strbuf));
369                 FATAL_ERROR(__FILE__, __LINE__,
370                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
371                                            ISC_MSG_WRITEFAILED,
372                                            "write() failed "
373                                            "during watcher poke: %s"),
374                             strbuf);
375         }
376
377         INSIST(cc == sizeof(buf));
378 }
379
380 /*
381  * Read a message on the internal fd.
382  */
383 static void
384 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
385         int buf[2];
386         int cc;
387         char strbuf[ISC_STRERRORSIZE];
388
389         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
390         if (cc < 0) {
391                 *msg = SELECT_POKE_NOTHING;
392                 *fd = -1;       /* Silence compiler. */
393                 if (SOFT_ERROR(errno))
394                         return;
395
396                 isc__strerror(errno, strbuf, sizeof(strbuf));
397                 FATAL_ERROR(__FILE__, __LINE__,
398                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
399                                            ISC_MSG_READFAILED,
400                                            "read() failed "
401                                            "during watcher poke: %s"),
402                             strbuf);
403                 
404                 return;
405         }
406         INSIST(cc == sizeof(buf));
407
408         *fd = buf[0];
409         *msg = buf[1];
410 }
411 #else /* ISC_PLATFORM_USETHREADS */
412 /*
413  * Update the state of the socketmgr when something changes.
414  */
415 static void
416 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
417         if (msg == SELECT_POKE_SHUTDOWN)
418                 return;
419         else if (fd >= 0)
420                 wakeup_socket(manager, fd, msg);
421         return;
422 }
423 #endif /* ISC_PLATFORM_USETHREADS */
424
425 /*
426  * Make a fd non-blocking.
427  */
428 static isc_result_t
429 make_nonblock(int fd) {
430         int ret;
431         int flags;
432         char strbuf[ISC_STRERRORSIZE];
433 #ifdef USE_FIONBIO_IOCTL
434         int on = 1;
435
436         ret = ioctl(fd, FIONBIO, (char *)&on);
437 #else
438         flags = fcntl(fd, F_GETFL, 0);
439         flags |= PORT_NONBLOCK;
440         ret = fcntl(fd, F_SETFL, flags);
441 #endif
442
443         if (ret == -1) {
444                 isc__strerror(errno, strbuf, sizeof(strbuf));
445                 UNEXPECTED_ERROR(__FILE__, __LINE__,
446 #ifdef USE_FIONBIO_IOCTL
447                                  "ioctl(%d, FIONBIO, &on): %s", fd,
448 #else
449                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
450 #endif
451                                  strbuf);
452
453                 return (ISC_R_UNEXPECTED);
454         }
455
456         return (ISC_R_SUCCESS);
457 }
458
459 #ifdef USE_CMSG
460 /*
461  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
462  * In order to ensure as much portability as possible, we provide wrapper
463  * functions of these macros.
464  * Note that cmsg_space() could run slow on OSes that do not have
465  * CMSG_SPACE.
466  */
467 static inline ISC_SOCKADDR_LEN_T
468 cmsg_len(ISC_SOCKADDR_LEN_T len) {
469 #ifdef CMSG_LEN
470         return (CMSG_LEN(len));
471 #else
472         ISC_SOCKADDR_LEN_T hdrlen;
473
474         /*
475          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
476          * is correct.
477          */
478         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
479         return (hdrlen + len);
480 #endif
481 }
482
483 static inline ISC_SOCKADDR_LEN_T
484 cmsg_space(ISC_SOCKADDR_LEN_T len) {
485 #ifdef CMSG_SPACE
486         return (CMSG_SPACE(len));
487 #else
488         struct msghdr msg;
489         struct cmsghdr *cmsgp;
490         /*
491          * XXX: The buffer length is an ad-hoc value, but should be enough
492          * in a practical sense.
493          */
494         char dummybuf[sizeof(struct cmsghdr) + 1024];
495
496         memset(&msg, 0, sizeof(msg));
497         msg.msg_control = dummybuf;
498         msg.msg_controllen = sizeof(dummybuf);
499
500         cmsgp = (struct cmsghdr *)dummybuf;
501         cmsgp->cmsg_len = cmsg_len(len);
502
503         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
504         if (cmsgp != NULL)
505                 return ((char *)cmsgp - (char *)msg.msg_control);
506         else
507                 return (0);
508 #endif  
509 }
510 #endif /* USE_CMSG */
511
512 /*
513  * Process control messages received on a socket.
514  */
515 static void
516 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
517 #ifdef USE_CMSG
518         struct cmsghdr *cmsgp;
519 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
520         struct in6_pktinfo *pktinfop;
521 #endif
522 #ifdef SO_TIMESTAMP
523         struct timeval *timevalp;
524 #endif
525 #endif
526
527         /*
528          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
529          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
530          * They are all here, outside of the CPP tests, because it is
531          * more consistent with the usual ISC coding style.
532          */
533         UNUSED(sock);
534         UNUSED(msg);
535         UNUSED(dev);
536
537 #ifdef ISC_NET_BSD44MSGHDR
538
539 #ifdef MSG_TRUNC
540         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
541                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
542 #endif
543
544 #ifdef MSG_CTRUNC
545         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
546                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
547 #endif
548
549 #ifndef USE_CMSG
550         return;
551 #else
552         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
553                 return;
554
555 #ifdef SO_TIMESTAMP
556         timevalp = NULL;
557 #endif
558 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
559         pktinfop = NULL;
560 #endif
561
562         cmsgp = CMSG_FIRSTHDR(msg);
563         while (cmsgp != NULL) {
564                 socket_log(sock, NULL, TRACE,
565                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
566                            "processing cmsg %p", cmsgp);
567
568 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
569                 if (cmsgp->cmsg_level == IPPROTO_IPV6
570                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
571
572                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
573                         memcpy(&dev->pktinfo, pktinfop,
574                                sizeof(struct in6_pktinfo));
575                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
576                         socket_log(sock, NULL, TRACE,
577                                    isc_msgcat, ISC_MSGSET_SOCKET,
578                                    ISC_MSG_IFRECEIVED,
579                                    "interface received on ifindex %u",
580                                    dev->pktinfo.ipi6_ifindex);
581                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
582                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;                         
583                         goto next;
584                 }
585 #endif
586
587 #ifdef SO_TIMESTAMP
588                 if (cmsgp->cmsg_level == SOL_SOCKET
589                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
590                         timevalp = (struct timeval *)CMSG_DATA(cmsgp);
591                         dev->timestamp.seconds = timevalp->tv_sec;
592                         dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
593                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
594                         goto next;
595                 }
596 #endif
597
598         next:
599                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
600         }
601 #endif /* USE_CMSG */
602
603 #endif /* ISC_NET_BSD44MSGHDR */
604 }
605
606 /*
607  * Construct an iov array and attach it to the msghdr passed in.  This is
608  * the SEND constructor, which will use the used region of the buffer
609  * (if using a buffer list) or will use the internal region (if a single
610  * buffer I/O is requested).
611  *
612  * Nothing can be NULL, and the done event must list at least one buffer
613  * on the buffer linked list for this function to be meaningful.
614  *
615  * If write_countp != NULL, *write_countp will hold the number of bytes
616  * this transaction can send.
617  */
618 static void
619 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
620                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
621 {
622         unsigned int iovcount;
623         isc_buffer_t *buffer;
624         isc_region_t used;
625         size_t write_count;
626         size_t skip_count;
627
628         memset(msg, 0, sizeof(*msg));
629
630         if (sock->type == isc_sockettype_udp) {
631                 msg->msg_name = (void *)&dev->address.type.sa;
632                 msg->msg_namelen = dev->address.length;
633         } else {
634                 msg->msg_name = NULL;
635                 msg->msg_namelen = 0;
636         }
637
638         buffer = ISC_LIST_HEAD(dev->bufferlist);
639         write_count = 0;
640         iovcount = 0;
641
642         /*
643          * Single buffer I/O?  Skip what we've done so far in this region.
644          */
645         if (buffer == NULL) {
646                 write_count = dev->region.length - dev->n;
647                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
648                 iov[0].iov_len = write_count;
649                 iovcount = 1;
650
651                 goto config;
652         }
653
654         /*
655          * Multibuffer I/O.
656          * Skip the data in the buffer list that we have already written.
657          */
658         skip_count = dev->n;
659         while (buffer != NULL) {
660                 REQUIRE(ISC_BUFFER_VALID(buffer));
661                 if (skip_count < isc_buffer_usedlength(buffer))
662                         break;
663                 skip_count -= isc_buffer_usedlength(buffer);
664                 buffer = ISC_LIST_NEXT(buffer, link);
665         }
666
667         while (buffer != NULL) {
668                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
669
670                 isc_buffer_usedregion(buffer, &used);
671
672                 if (used.length > 0) {
673                         iov[iovcount].iov_base = (void *)(used.base
674                                                           + skip_count);
675                         iov[iovcount].iov_len = used.length - skip_count;
676                         write_count += (used.length - skip_count);
677                         skip_count = 0;
678                         iovcount++;
679                 }
680                 buffer = ISC_LIST_NEXT(buffer, link);
681         }
682
683         INSIST(skip_count == 0U);
684
685  config:
686         msg->msg_iov = iov;
687         msg->msg_iovlen = iovcount;
688
689 #ifdef ISC_NET_BSD44MSGHDR
690         msg->msg_control = NULL;
691         msg->msg_controllen = 0;
692         msg->msg_flags = 0;
693 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
694         if ((sock->type == isc_sockettype_udp)
695             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
696                 struct cmsghdr *cmsgp;
697                 struct in6_pktinfo *pktinfop;
698
699                 socket_log(sock, NULL, TRACE,
700                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
701                            "sendto pktinfo data, ifindex %u",
702                            dev->pktinfo.ipi6_ifindex);
703
704                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
705                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
706                 msg->msg_control = (void *)sock->sendcmsgbuf;
707
708                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
709                 cmsgp->cmsg_level = IPPROTO_IPV6;
710                 cmsgp->cmsg_type = IPV6_PKTINFO;
711                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
712                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
713                 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
714         }
715 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
716 #else /* ISC_NET_BSD44MSGHDR */
717         msg->msg_accrights = NULL;
718         msg->msg_accrightslen = 0;
719 #endif /* ISC_NET_BSD44MSGHDR */
720
721         if (write_countp != NULL)
722                 *write_countp = write_count;
723 }
724
725 /*
726  * Construct an iov array and attach it to the msghdr passed in.  This is
727  * the RECV constructor, which will use the avialable region of the buffer
728  * (if using a buffer list) or will use the internal region (if a single
729  * buffer I/O is requested).
730  *
731  * Nothing can be NULL, and the done event must list at least one buffer
732  * on the buffer linked list for this function to be meaningful.
733  *
734  * If read_countp != NULL, *read_countp will hold the number of bytes
735  * this transaction can receive.
736  */
737 static void
738 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
739                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
740 {
741         unsigned int iovcount;
742         isc_buffer_t *buffer;
743         isc_region_t available;
744         size_t read_count;
745
746         memset(msg, 0, sizeof(struct msghdr));
747
748         if (sock->type == isc_sockettype_udp) {
749                 memset(&dev->address, 0, sizeof(dev->address));
750                 msg->msg_name = (void *)&dev->address.type.sa;
751                 msg->msg_namelen = sizeof(dev->address.type);
752 #ifdef ISC_NET_RECVOVERFLOW
753                 /* If needed, steal one iovec for overflow detection. */
754                 maxiov--;
755 #endif
756         } else { /* TCP */
757                 msg->msg_name = NULL;
758                 msg->msg_namelen = 0;
759                 dev->address = sock->address;
760         }
761
762         buffer = ISC_LIST_HEAD(dev->bufferlist);
763         read_count = 0;
764
765         /*
766          * Single buffer I/O?  Skip what we've done so far in this region.
767          */
768         if (buffer == NULL) {
769                 read_count = dev->region.length - dev->n;
770                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
771                 iov[0].iov_len = read_count;
772                 iovcount = 1;
773
774                 goto config;
775         }
776
777         /*
778          * Multibuffer I/O.
779          * Skip empty buffers.
780          */
781         while (buffer != NULL) {
782                 REQUIRE(ISC_BUFFER_VALID(buffer));
783                 if (isc_buffer_availablelength(buffer) != 0)
784                         break;
785                 buffer = ISC_LIST_NEXT(buffer, link);
786         }
787
788         iovcount = 0;
789         while (buffer != NULL) {
790                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
791
792                 isc_buffer_availableregion(buffer, &available);
793
794                 if (available.length > 0) {
795                         iov[iovcount].iov_base = (void *)(available.base);
796                         iov[iovcount].iov_len = available.length;
797                         read_count += available.length;
798                         iovcount++;
799                 }
800                 buffer = ISC_LIST_NEXT(buffer, link);
801         }
802
803  config:
804
805         /*
806          * If needed, set up to receive that one extra byte.  Note that
807          * we know there is at least one iov left, since we stole it
808          * at the top of this function.
809          */
810 #ifdef ISC_NET_RECVOVERFLOW
811         if (sock->type == isc_sockettype_udp) {
812                 iov[iovcount].iov_base = (void *)(&sock->overflow);
813                 iov[iovcount].iov_len = 1;
814                 iovcount++;
815         }
816 #endif
817
818         msg->msg_iov = iov;
819         msg->msg_iovlen = iovcount;
820
821 #ifdef ISC_NET_BSD44MSGHDR
822         msg->msg_control = NULL;
823         msg->msg_controllen = 0;
824         msg->msg_flags = 0;
825 #if defined(USE_CMSG)
826         if (sock->type == isc_sockettype_udp) {
827                 msg->msg_control = sock->recvcmsgbuf;
828                 msg->msg_controllen = sock->recvcmsgbuflen;
829         }
830 #endif /* USE_CMSG */
831 #else /* ISC_NET_BSD44MSGHDR */
832         msg->msg_accrights = NULL;
833         msg->msg_accrightslen = 0;
834 #endif /* ISC_NET_BSD44MSGHDR */
835
836         if (read_countp != NULL)
837                 *read_countp = read_count;
838 }
839
840 static void
841 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
842                 isc_socketevent_t *dev)
843 {
844         if (sock->type == isc_sockettype_udp) {
845                 if (address != NULL)
846                         dev->address = *address;
847                 else
848                         dev->address = sock->address;
849         } else if (sock->type == isc_sockettype_tcp) {
850                 INSIST(address == NULL);
851                 dev->address = sock->address;
852         }
853 }
854
855 static isc_socketevent_t *
856 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
857                      isc_taskaction_t action, const void *arg)
858 {
859         isc_socketevent_t *ev;
860
861         ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
862                                                      sock, eventtype,
863                                                      action, arg,
864                                                      sizeof(*ev));
865
866         if (ev == NULL)
867                 return (NULL);
868
869         ev->result = ISC_R_UNEXPECTED;
870         ISC_LINK_INIT(ev, ev_link);
871         ISC_LIST_INIT(ev->bufferlist);
872         ev->region.base = NULL;
873         ev->n = 0;
874         ev->offset = 0;
875         ev->attributes = 0;
876
877         return (ev);
878 }
879
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the fields of 'msg' (name, iovec array and, on
 * BSD 4.4 style msghdrs, the control buffer) to stdout.
 *
 * The length fields have platform-dependent integer types (socklen_t,
 * size_t, int), so each one is cast to unsigned long before printing, and
 * every pointer is cast to void * as "%p" requires.
 */
static void
dump_msg(struct msghdr *msg) {
        unsigned int i;

        printf("MSGHDR %p\n", (void *)msg);
        printf("\tname %p, namelen %lu\n", (void *)msg->msg_name,
               (unsigned long)msg->msg_namelen);
        printf("\tiov %p, iovlen %lu\n", (void *)msg->msg_iov,
               (unsigned long)msg->msg_iovlen);
        for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
                printf("\t\t%u\tbase %p, len %lu\n", i,
                       (void *)msg->msg_iov[i].iov_base,
                       (unsigned long)msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
        printf("\tcontrol %p, controllen %lu\n", (void *)msg->msg_control,
               (unsigned long)msg->msg_controllen);
#endif
}
#endif
898
899 #define DOIO_SUCCESS            0       /* i/o ok, event sent */
900 #define DOIO_SOFT               1       /* i/o ok, soft error, no event sent */
901 #define DOIO_HARD               2       /* i/o error, event sent */
902 #define DOIO_EOF                3       /* EOF, no event sent */
903
904 static int
905 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
906         int cc;
907         struct iovec iov[MAXSCATTERGATHER_RECV];
908         size_t read_count;
909         size_t actual_count;
910         struct msghdr msghdr;
911         isc_buffer_t *buffer;
912         int recv_errno;
913         char strbuf[ISC_STRERRORSIZE];
914
915         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
916
917 #if defined(ISC_SOCKET_DEBUG)
918         dump_msg(&msghdr);
919 #endif
920
921         cc = recvmsg(sock->fd, &msghdr, 0);
922         recv_errno = errno;
923
924         if (cc < 0) {
925                 if (SOFT_ERROR(recv_errno))
926                         return (DOIO_SOFT);
927
928                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
929                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
930                         socket_log(sock, NULL, IOEVENT,
931                                    isc_msgcat, ISC_MSGSET_SOCKET,
932                                    ISC_MSG_DOIORECV, 
933                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
934                                    sock->fd, cc, recv_errno, strbuf);
935                 }
936
937 #define SOFT_OR_HARD(_system, _isc) \
938         if (recv_errno == _system) { \
939                 if (sock->connected) { \
940                         dev->result = _isc; \
941                         return (DOIO_HARD); \
942                 } \
943                 return (DOIO_SOFT); \
944         }
945 #define ALWAYS_HARD(_system, _isc) \
946         if (recv_errno == _system) { \
947                 dev->result = _isc; \
948                 return (DOIO_HARD); \
949         }
950
951                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
952                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
953                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
954                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
955                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
956                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
957                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
958
959 #undef SOFT_OR_HARD
960 #undef ALWAYS_HARD
961
962                 dev->result = isc__errno2result(recv_errno);
963                 return (DOIO_HARD);
964         }
965
966         /*
967          * On TCP, zero length reads indicate EOF, while on
968          * UDP, zero length reads are perfectly valid, although
969          * strange.
970          */
971         if ((sock->type == isc_sockettype_tcp) && (cc == 0))
972                 return (DOIO_EOF);
973
974         if (sock->type == isc_sockettype_udp) {
975                 dev->address.length = msghdr.msg_namelen;
976                 if (isc_sockaddr_getport(&dev->address) == 0) {
977                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
978                                 socket_log(sock, &dev->address, IOEVENT,
979                                            isc_msgcat, ISC_MSGSET_SOCKET,
980                                            ISC_MSG_ZEROPORT, 
981                                            "dropping source port zero packet");
982                         }
983                         return (DOIO_SOFT);
984                 }
985         }
986
987         socket_log(sock, &dev->address, IOEVENT,
988                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
989                    "packet received correctly");
990
991         /*
992          * Overflow bit detection.  If we received MORE bytes than we should,
993          * this indicates an overflow situation.  Set the flag in the
994          * dev entry and adjust how much we read by one.
995          */
996 #ifdef ISC_NET_RECVOVERFLOW
997         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
998                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
999                 cc--;
1000         }
1001 #endif
1002
1003         /*
1004          * If there are control messages attached, run through them and pull
1005          * out the interesting bits.
1006          */
1007         if (sock->type == isc_sockettype_udp)
1008                 process_cmsg(sock, &msghdr, dev);
1009
1010         /*
1011          * update the buffers (if any) and the i/o count
1012          */
1013         dev->n += cc;
1014         actual_count = cc;
1015         buffer = ISC_LIST_HEAD(dev->bufferlist);
1016         while (buffer != NULL && actual_count > 0U) {
1017                 REQUIRE(ISC_BUFFER_VALID(buffer));
1018                 if (isc_buffer_availablelength(buffer) <= actual_count) {
1019                         actual_count -= isc_buffer_availablelength(buffer);
1020                         isc_buffer_add(buffer,
1021                                        isc_buffer_availablelength(buffer));
1022                 } else {
1023                         isc_buffer_add(buffer, actual_count);
1024                         actual_count = 0;
1025                         break;
1026                 }
1027                 buffer = ISC_LIST_NEXT(buffer, link);
1028                 if (buffer == NULL) {
1029                         INSIST(actual_count == 0U);
1030                 }
1031         }
1032
1033         /*
1034          * If we read less than we expected, update counters,
1035          * and let the upper layer poke the descriptor.
1036          */
1037         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1038                 return (DOIO_SOFT);
1039
1040         /*
1041          * Full reads are posted, or partials if partials are ok.
1042          */
1043         dev->result = ISC_R_SUCCESS;
1044         return (DOIO_SUCCESS);
1045 }
1046
1047 /*
1048  * Returns:
1049  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
1050  *                      ISC_R_SUCCESS.
1051  *
1052  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
1053  *                      dev->result contains the appropriate error.
1054  *
1055  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
1056  *                      event was sent.  The operation should be retried.
1057  *
1058  *      No other return values are possible.
1059  */
1060 static int
1061 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1062         int cc;
1063         struct iovec iov[MAXSCATTERGATHER_SEND];
1064         size_t write_count;
1065         struct msghdr msghdr;
1066         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1067         int attempts = 0;
1068         int send_errno;
1069         char strbuf[ISC_STRERRORSIZE];
1070
1071         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1072
1073  resend:
1074         cc = sendmsg(sock->fd, &msghdr, 0);
1075         send_errno = errno;
1076
1077         /*
1078          * Check for error or block condition.
1079          */
1080         if (cc < 0) {
1081                 if (send_errno == EINTR && ++attempts < NRETRIES)
1082                         goto resend;
1083
1084                 if (SOFT_ERROR(send_errno))
1085                         return (DOIO_SOFT);
1086
1087 #define SOFT_OR_HARD(_system, _isc) \
1088         if (send_errno == _system) { \
1089                 if (sock->connected) { \
1090                         dev->result = _isc; \
1091                         return (DOIO_HARD); \
1092                 } \
1093                 return (DOIO_SOFT); \
1094         }
1095 #define ALWAYS_HARD(_system, _isc) \
1096         if (send_errno == _system) { \
1097                 dev->result = _isc; \
1098                 return (DOIO_HARD); \
1099         }
1100
1101                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1102                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1103                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1104                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1105                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1106 #ifdef EHOSTDOWN
1107                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1108 #endif
1109                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1110                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1111                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1112                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1113                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1114
1115 #undef SOFT_OR_HARD
1116 #undef ALWAYS_HARD
1117
1118                 /*
1119                  * The other error types depend on whether or not the
1120                  * socket is UDP or TCP.  If it is UDP, some errors
1121                  * that we expect to be fatal under TCP are merely
1122                  * annoying, and are really soft errors.
1123                  *
1124                  * However, these soft errors are still returned as
1125                  * a status.
1126                  */
1127                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1128                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1129                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1130                                  addrbuf, strbuf);
1131                 dev->result = isc__errno2result(send_errno);
1132                 return (DOIO_HARD);
1133         }
1134
1135         if (cc == 0)
1136                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1137                                  "internal_send: send() %s 0",
1138                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1139                                                 ISC_MSG_RETURNED, "returned"));
1140
1141         /*
1142          * If we write less than we expected, update counters, poke.
1143          */
1144         dev->n += cc;
1145         if ((size_t)cc != write_count)
1146                 return (DOIO_SOFT);
1147
1148         /*
1149          * Exactly what we wanted to write.  We're done with this
1150          * entry.  Post its completion event.
1151          */
1152         dev->result = ISC_R_SUCCESS;
1153         return (DOIO_SUCCESS);
1154 }
1155
1156 /*
1157  * Kill.
1158  *
1159  * Caller must ensure that the socket is not locked and no external
1160  * references exist.
1161  */
1162 static void
1163 destroy(isc_socket_t **sockp) {
1164         isc_socket_t *sock = *sockp;
1165         isc_socketmgr_t *manager = sock->manager;
1166
1167         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1168                    ISC_MSG_DESTROYING, "destroying");
1169
1170         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1171         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1172         INSIST(ISC_LIST_EMPTY(sock->send_list));
1173         INSIST(sock->connect_ev == NULL);
1174         REQUIRE(sock->fd >= 0 && sock->fd < (int)FD_SETSIZE);
1175
1176         LOCK(&manager->lock);
1177
1178         /*
1179          * No one has this socket open, so the watcher doesn't have to be
1180          * poked, and the socket doesn't have to be locked.
1181          */
1182         manager->fds[sock->fd] = NULL;
1183         manager->fdstate[sock->fd] = CLOSE_PENDING;
1184         select_poke(manager, sock->fd, SELECT_POKE_CLOSE);
1185         ISC_LIST_UNLINK(manager->socklist, sock, link);
1186
1187 #ifdef ISC_PLATFORM_USETHREADS
1188         if (ISC_LIST_EMPTY(manager->socklist))
1189                 SIGNAL(&manager->shutdown_ok);
1190 #endif /* ISC_PLATFORM_USETHREADS */
1191
1192         /*
1193          * XXX should reset manager->maxfd here
1194          */
1195
1196         UNLOCK(&manager->lock);
1197
1198         free_socket(sockp);
1199 }
1200
1201 static isc_result_t
1202 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1203                 isc_socket_t **socketp)
1204 {
1205         isc_socket_t *sock;
1206         isc_result_t ret;
1207         ISC_SOCKADDR_LEN_T cmsgbuflen;
1208
1209         sock = isc_mem_get(manager->mctx, sizeof(*sock));
1210
1211         if (sock == NULL)
1212                 return (ISC_R_NOMEMORY);
1213
1214         ret = ISC_R_UNEXPECTED;
1215
1216         sock->magic = 0;
1217         sock->references = 0;
1218
1219         sock->manager = manager;
1220         sock->type = type;
1221         sock->fd = -1;
1222
1223         ISC_LINK_INIT(sock, link);
1224
1225         sock->recvcmsgbuf = NULL;
1226         sock->sendcmsgbuf = NULL;
1227
1228         /*
1229          * set up cmsg buffers
1230          */
1231         cmsgbuflen = 0;
1232 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1233         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1234 #endif
1235 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1236         cmsgbuflen += cmsg_space(sizeof(struct timeval));
1237 #endif
1238         sock->recvcmsgbuflen = cmsgbuflen;
1239         if (sock->recvcmsgbuflen != 0U) {
1240                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1241                 if (sock->recvcmsgbuf == NULL)
1242                         goto error;
1243         }
1244
1245         cmsgbuflen = 0;
1246 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1247         cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1248 #endif
1249         sock->sendcmsgbuflen = cmsgbuflen;
1250         if (sock->sendcmsgbuflen != 0U) {
1251                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1252                 if (sock->sendcmsgbuf == NULL)
1253                         goto error;
1254         }
1255
1256         /*
1257          * set up list of readers and writers to be initially empty
1258          */
1259         ISC_LIST_INIT(sock->recv_list);
1260         ISC_LIST_INIT(sock->send_list);
1261         ISC_LIST_INIT(sock->accept_list);
1262         sock->connect_ev = NULL;
1263         sock->pending_recv = 0;
1264         sock->pending_send = 0;
1265         sock->pending_accept = 0;
1266         sock->listener = 0;
1267         sock->connected = 0;
1268         sock->connecting = 0;
1269         sock->bound = 0;
1270
1271         /*
1272          * initialize the lock
1273          */
1274         if (isc_mutex_init(&sock->lock) != ISC_R_SUCCESS) {
1275                 sock->magic = 0;
1276                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1277                                  "isc_mutex_init() %s",
1278                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1279                                                 ISC_MSG_FAILED, "failed"));
1280                 ret = ISC_R_UNEXPECTED;
1281                 goto error;
1282         }
1283
1284         /*
1285          * Initialize readable and writable events
1286          */
1287         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1288                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1289                        NULL, sock, sock, NULL, NULL);
1290         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1291                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1292                        NULL, sock, sock, NULL, NULL);
1293
1294         sock->magic = SOCKET_MAGIC;
1295         *socketp = sock;
1296
1297         return (ISC_R_SUCCESS);
1298
1299  error:
1300         if (sock->recvcmsgbuf != NULL)
1301                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1302                             sock->recvcmsgbuflen);
1303         if (sock->sendcmsgbuf != NULL)
1304                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1305                             sock->sendcmsgbuflen);
1306         isc_mem_put(manager->mctx, sock, sizeof(*sock));
1307
1308         return (ret);
1309 }
1310
1311 /*
1312  * This event requires that the various lists be empty, that the reference
1313  * count be 1, and that the magic number is valid.  The other socket bits,
1314  * like the lock, must be initialized as well.  The fd associated must be
1315  * marked as closed, by setting it to -1 on close, or this routine will
1316  * also close the socket.
1317  */
1318 static void
1319 free_socket(isc_socket_t **socketp) {
1320         isc_socket_t *sock = *socketp;
1321
1322         INSIST(sock->references == 0);
1323         INSIST(VALID_SOCKET(sock));
1324         INSIST(!sock->connecting);
1325         INSIST(!sock->pending_recv);
1326         INSIST(!sock->pending_send);
1327         INSIST(!sock->pending_accept);
1328         INSIST(ISC_LIST_EMPTY(sock->recv_list));
1329         INSIST(ISC_LIST_EMPTY(sock->send_list));
1330         INSIST(ISC_LIST_EMPTY(sock->accept_list));
1331         INSIST(!ISC_LINK_LINKED(sock, link));
1332
1333         if (sock->recvcmsgbuf != NULL)
1334                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1335                             sock->recvcmsgbuflen);
1336         if (sock->sendcmsgbuf != NULL)
1337                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1338                             sock->sendcmsgbuflen);
1339
1340         sock->magic = 0;
1341
1342         DESTROYLOCK(&sock->lock);
1343
1344         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1345
1346         *socketp = NULL;
1347 }
1348
1349 /*
1350  * Create a new 'type' socket managed by 'manager'.  Events
1351  * will be posted to 'task' and when dispatched 'action' will be
1352  * called with 'arg' as the arg value.  The new socket is returned
1353  * in 'socketp'.
1354  */
1355 isc_result_t
1356 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1357                   isc_socket_t **socketp)
1358 {
1359         isc_socket_t *sock = NULL;
1360         isc_result_t ret;
1361 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
1362         int on = 1;
1363 #endif
1364         char strbuf[ISC_STRERRORSIZE];
1365         const char *err = "socket";
1366
1367         REQUIRE(VALID_MANAGER(manager));
1368         REQUIRE(socketp != NULL && *socketp == NULL);
1369
1370         ret = allocate_socket(manager, type, &sock);
1371         if (ret != ISC_R_SUCCESS)
1372                 return (ret);
1373
1374         sock->pf = pf;
1375         switch (type) {
1376         case isc_sockettype_udp:
1377                 sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1378                 break;
1379         case isc_sockettype_tcp:
1380                 sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1381                 break;
1382         }
1383
1384 #ifdef F_DUPFD
1385         /*
1386          * Leave a space for stdio to work in.
1387          */
1388         if (sock->fd >= 0 && sock->fd < 20) {
1389                 int new, tmp;
1390                 new = fcntl(sock->fd, F_DUPFD, 20);
1391                 tmp = errno;
1392                 (void)close(sock->fd);
1393                 errno = tmp;
1394                 sock->fd = new;
1395                 err = "isc_socket_create: fcntl";
1396         }
1397 #endif
1398
1399         if (sock->fd >= (int)FD_SETSIZE) {
1400                 (void)close(sock->fd);
1401                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1402                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1403                                isc_msgcat, ISC_MSGSET_SOCKET,
1404                                ISC_MSG_TOOMANYFDS,
1405                                "%s: too many open file descriptors", "socket");
1406                 free_socket(&sock);
1407                 return (ISC_R_NORESOURCES);
1408         }
1409         
1410         if (sock->fd < 0) {
1411                 free_socket(&sock);
1412
1413                 switch (errno) {
1414                 case EMFILE:
1415                 case ENFILE:
1416                 case ENOBUFS:
1417                         return (ISC_R_NORESOURCES);
1418
1419                 case EPROTONOSUPPORT:
1420                 case EPFNOSUPPORT:
1421                 case EAFNOSUPPORT:
1422                 /*
1423                  * Linux 2.2 (and maybe others) return EINVAL instead of
1424                  * EAFNOSUPPORT.
1425                  */
1426                 case EINVAL:
1427                         return (ISC_R_FAMILYNOSUPPORT);
1428
1429                 default:
1430                         isc__strerror(errno, strbuf, sizeof(strbuf));
1431                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1432                                          "%s() %s: %s", err,
1433                                          isc_msgcat_get(isc_msgcat,
1434                                                         ISC_MSGSET_GENERAL,
1435                                                         ISC_MSG_FAILED,
1436                                                         "failed"),
1437                                          strbuf);
1438                         return (ISC_R_UNEXPECTED);
1439                 }
1440         }
1441
1442         if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
1443                 (void)close(sock->fd);
1444                 free_socket(&sock);
1445                 return (ISC_R_UNEXPECTED);
1446         }
1447
1448 #ifdef SO_BSDCOMPAT
1449         if (setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
1450                        (void *)&on, sizeof(on)) < 0) {
1451                 isc__strerror(errno, strbuf, sizeof(strbuf));
1452                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1453                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
1454                                  sock->fd,
1455                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1456                                                 ISC_MSG_FAILED, "failed"),
1457                                  strbuf);
1458                 /* Press on... */
1459         }
1460 #endif
1461
1462 #if defined(USE_CMSG)
1463         if (type == isc_sockettype_udp) {
1464
1465 #if defined(SO_TIMESTAMP)
1466                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1467                                (void *)&on, sizeof(on)) < 0
1468                     && errno != ENOPROTOOPT) {
1469                         isc__strerror(errno, strbuf, sizeof(strbuf));
1470                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1471                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1472                                          sock->fd, 
1473                                          isc_msgcat_get(isc_msgcat,
1474                                                         ISC_MSGSET_GENERAL,
1475                                                         ISC_MSG_FAILED,
1476                                                         "failed"),
1477                                          strbuf);
1478                         /* Press on... */
1479                 }
1480 #endif /* SO_TIMESTAMP */
1481
1482 #if defined(ISC_PLATFORM_HAVEIPV6)
1483                 if (pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
1484                         /*
1485                          * Warn explicitly because this anomaly can be hidden
1486                          * in usual operation (and unexpectedly appear later).
1487                          */
1488                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1489                                          "No buffer available to receive "
1490                                          "IPv6 destination");
1491                 }
1492 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1493 #ifdef IPV6_RECVPKTINFO
1494                 /* 2292bis */
1495                 if ((pf == AF_INET6)
1496                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1497                                    (void *)&on, sizeof(on)) < 0)) {
1498                         isc__strerror(errno, strbuf, sizeof(strbuf));
1499                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1500                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
1501                                          "%s: %s", sock->fd,
1502                                          isc_msgcat_get(isc_msgcat,
1503                                                         ISC_MSGSET_GENERAL,
1504                                                         ISC_MSG_FAILED,
1505                                                         "failed"),
1506                                          strbuf);
1507                 }
1508 #else
1509                 /* 2292 */
1510                 if ((pf == AF_INET6)
1511                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1512                                    (void *)&on, sizeof(on)) < 0)) {
1513                         isc__strerror(errno, strbuf, sizeof(strbuf));
1514                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1515                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1516                                          sock->fd,
1517                                          isc_msgcat_get(isc_msgcat,
1518                                                         ISC_MSGSET_GENERAL,
1519                                                         ISC_MSG_FAILED,
1520                                                         "failed"),
1521                                          strbuf);
1522                 }
1523 #endif /* IPV6_RECVPKTINFO */
1524 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
1525 #ifdef IPV6_USE_MIN_MTU        /*2292bis, not too common yet*/
1526                 /* use minimum MTU */
1527                 if (pf == AF_INET6) {
1528                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
1529                                          IPV6_USE_MIN_MTU,
1530                                          (void *)&on, sizeof(on));
1531                 }
1532 #endif
1533 #endif /* ISC_PLATFORM_HAVEIPV6 */
1534
1535         }
1536 #endif /* USE_CMSG */
1537
1538         sock->references = 1;
1539         *socketp = sock;
1540
1541         LOCK(&manager->lock);
1542
1543         /*
1544          * Note we don't have to lock the socket like we normally would because
1545          * there are no external references to it yet.
1546          */
1547
1548         manager->fds[sock->fd] = sock;
1549         manager->fdstate[sock->fd] = MANAGED;
1550         ISC_LIST_APPEND(manager->socklist, sock, link);
1551         if (manager->maxfd < sock->fd)
1552                 manager->maxfd = sock->fd;
1553
1554         UNLOCK(&manager->lock);
1555
1556         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1557                    ISC_MSG_CREATED, "created");
1558
1559         return (ISC_R_SUCCESS);
1560 }
1561
1562 /*
1563  * Attach to a socket.  Caller must explicitly detach when it is done.
1564  */
1565 void
1566 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1567         REQUIRE(VALID_SOCKET(sock));
1568         REQUIRE(socketp != NULL && *socketp == NULL);
1569
1570         LOCK(&sock->lock);
1571         sock->references++;
1572         UNLOCK(&sock->lock);
1573
1574         *socketp = sock;
1575 }
1576
1577 /*
1578  * Dereference a socket.  If this is the last reference to it, clean things
1579  * up by destroying the socket.
1580  */
1581 void
1582 isc_socket_detach(isc_socket_t **socketp) {
1583         isc_socket_t *sock;
1584         isc_boolean_t kill_socket = ISC_FALSE;
1585
1586         REQUIRE(socketp != NULL);
1587         sock = *socketp;
1588         REQUIRE(VALID_SOCKET(sock));
1589
1590         LOCK(&sock->lock);
1591         REQUIRE(sock->references > 0);
1592         sock->references--;
1593         if (sock->references == 0)
1594                 kill_socket = ISC_TRUE;
1595         UNLOCK(&sock->lock);
1596
1597         if (kill_socket)
1598                 destroy(&sock);
1599
1600         *socketp = NULL;
1601 }
1602
1603 /*
1604  * I/O is possible on a given socket.  Schedule an event to this task that
1605  * will call an internal function to do the I/O.  This will charge the
1606  * task with the I/O operation and let our select loop handler get back
1607  * to doing something real as fast as possible.
1608  *
1609  * The socket and manager must be locked before calling this function.
1610  */
1611 static void
1612 dispatch_recv(isc_socket_t *sock) {
1613         intev_t *iev;
1614         isc_socketevent_t *ev;
1615
1616         INSIST(!sock->pending_recv);
1617
1618         ev = ISC_LIST_HEAD(sock->recv_list);
1619         if (ev == NULL)
1620                 return;
1621
1622         sock->pending_recv = 1;
1623         iev = &sock->readable_ev;
1624
1625         socket_log(sock, NULL, EVENT, NULL, 0, 0,
1626                    "dispatch_recv:  event %p -> task %p", ev, ev->ev_sender);
1627
1628         sock->references++;
1629         iev->ev_sender = sock;
1630         iev->ev_action = internal_recv;
1631         iev->ev_arg = sock;
1632
1633         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1634 }
1635
1636 static void
1637 dispatch_send(isc_socket_t *sock) {
1638         intev_t *iev;
1639         isc_socketevent_t *ev;
1640
1641         INSIST(!sock->pending_send);
1642
1643         ev = ISC_LIST_HEAD(sock->send_list);
1644         if (ev == NULL)
1645                 return;
1646
1647         sock->pending_send = 1;
1648         iev = &sock->writable_ev;
1649
1650         socket_log(sock, NULL, EVENT, NULL, 0, 0,
1651                    "dispatch_send:  event %p -> task %p", ev, ev->ev_sender);
1652
1653         sock->references++;
1654         iev->ev_sender = sock;
1655         iev->ev_action = internal_send;
1656         iev->ev_arg = sock;
1657
1658         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1659 }
1660
1661 /*
1662  * Dispatch an internal accept event.
1663  */
1664 static void
1665 dispatch_accept(isc_socket_t *sock) {
1666         intev_t *iev;
1667         isc_socket_newconnev_t *ev;
1668
1669         INSIST(!sock->pending_accept);
1670
1671         /*
1672          * Are there any done events left, or were they all canceled
1673          * before the manager got the socket lock?
1674          */
1675         ev = ISC_LIST_HEAD(sock->accept_list);
1676         if (ev == NULL)
1677                 return;
1678
1679         sock->pending_accept = 1;
1680         iev = &sock->readable_ev;
1681
1682         sock->references++;  /* keep socket around for this internal event */
1683         iev->ev_sender = sock;
1684         iev->ev_action = internal_accept;
1685         iev->ev_arg = sock;
1686
1687         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1688 }
1689
1690 static void
1691 dispatch_connect(isc_socket_t *sock) {
1692         intev_t *iev;
1693         isc_socket_connev_t *ev;
1694
1695         iev = &sock->writable_ev;
1696
1697         ev = sock->connect_ev;
1698         INSIST(ev != NULL); /* XXX */
1699
1700         INSIST(sock->connecting);
1701
1702         sock->references++;  /* keep socket around for this internal event */
1703         iev->ev_sender = sock;
1704         iev->ev_action = internal_connect;
1705         iev->ev_arg = sock;
1706
1707         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1708 }
1709
1710 /*
1711  * Dequeue an item off the given socket's read queue, set the result code
1712  * in the done event to the one provided, and send it to the task it was
1713  * destined for.
1714  *
1715  * If the event to be sent is on a list, remove it before sending.  If
1716  * asked to, send and detach from the socket as well.
1717  *
1718  * Caller must have the socket locked if the event is attached to the socket.
1719  */
1720 static void
1721 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1722         isc_task_t *task;
1723
1724         task = (*dev)->ev_sender;
1725
1726         (*dev)->ev_sender = sock;
1727
1728         if (ISC_LINK_LINKED(*dev, ev_link))
1729                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1730
1731         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1732             == ISC_SOCKEVENTATTR_ATTACHED)
1733                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1734         else
1735                 isc_task_send(task, (isc_event_t **)dev);
1736 }
1737
1738 /*
1739  * See comments for send_recvdone_event() above.
1740  *
1741  * Caller must have the socket locked if the event is attached to the socket.
1742  */
1743 static void
1744 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1745         isc_task_t *task;
1746
1747         INSIST(dev != NULL && *dev != NULL);
1748
1749         task = (*dev)->ev_sender;
1750         (*dev)->ev_sender = sock;
1751
1752         if (ISC_LINK_LINKED(*dev, ev_link))
1753                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1754
1755         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1756             == ISC_SOCKEVENTATTR_ATTACHED)
1757                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1758         else
1759                 isc_task_send(task, (isc_event_t **)dev);
1760 }
1761
1762 /*
1763  * Call accept() on a socket, to get the new file descriptor.  The listen
1764  * socket is used as a prototype to create a new isc_socket_t.  The new
1765  * socket has one outstanding reference.  The task receiving the event
1766  * will be detached from just after the event is delivered.
1767  *
1768  * On entry to this function, the event delivered is the internal
1769  * readable event, and the first item on the accept_list should be
1770  * the done event we want to send.  If the list is empty, this is a no-op,
1771  * so just unlock and return.
1772  */
1773 static void
1774 internal_accept(isc_task_t *me, isc_event_t *ev) {
1775         isc_socket_t *sock;
1776         isc_socketmgr_t *manager;
1777         isc_socket_newconnev_t *dev;
1778         isc_task_t *task;
1779         ISC_SOCKADDR_LEN_T addrlen;
1780         int fd;
1781         isc_result_t result = ISC_R_SUCCESS;
1782         char strbuf[ISC_STRERRORSIZE];
1783         const char *err = "accept";
1784
1785         UNUSED(me);
1786
1787         sock = ev->ev_sender;
1788         INSIST(VALID_SOCKET(sock));
1789
1790         LOCK(&sock->lock);
1791         socket_log(sock, NULL, TRACE,
1792                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1793                    "internal_accept called, locked socket");
1794
1795         manager = sock->manager;
1796         INSIST(VALID_MANAGER(manager));
1797
1798         INSIST(sock->listener);
1799         INSIST(sock->pending_accept == 1);
1800         sock->pending_accept = 0;
1801
1802         INSIST(sock->references > 0);
1803         sock->references--;  /* the internal event is done with this socket */
1804         if (sock->references == 0) {
1805                 UNLOCK(&sock->lock);
1806                 destroy(&sock);
1807                 return;
1808         }
1809
1810         /*
1811          * Get the first item off the accept list.
1812          * If it is empty, unlock the socket and return.
1813          */
1814         dev = ISC_LIST_HEAD(sock->accept_list);
1815         if (dev == NULL) {
1816                 UNLOCK(&sock->lock);
1817                 return;
1818         }
1819
1820         /*
1821          * Try to accept the new connection.  If the accept fails with
1822          * EAGAIN or EINTR, simply poke the watcher to watch this socket
1823          * again.  Also ignore ECONNRESET, which has been reported to
1824          * be spuriously returned on Linux 2.2.19 although it is not
1825          * a documented error for accept().  ECONNABORTED has been
1826          * reported for Solaris 8.  The rest are thrown in not because
1827          * we have seen them but because they are ignored by other
1828          * deamons such as BIND 8 and Apache.
1829          */
1830
1831         addrlen = sizeof(dev->newsocket->address.type);
1832         memset(&dev->newsocket->address.type.sa, 0, addrlen);
1833         fd = accept(sock->fd, &dev->newsocket->address.type.sa,
1834                     (void *)&addrlen);
1835
1836 #ifdef F_DUPFD
1837         /*
1838          * Leave a space for stdio to work in.
1839          */
1840         if (fd >= 0 && fd < 20) {
1841                 int new, tmp;
1842                 new = fcntl(fd, F_DUPFD, 20);
1843                 tmp = errno;
1844                 (void)close(fd);
1845                 errno = tmp;
1846                 fd = new;
1847                 err = "fcntl";
1848         }
1849 #endif
1850
1851         if (fd < 0) {
1852                 if (SOFT_ERROR(errno))
1853                         goto soft_error;
1854                 switch (errno) {
1855                 case ENOBUFS:
1856                 case ENFILE:
1857                 case ENOMEM:
1858                 case ECONNRESET:
1859                 case ECONNABORTED:
1860                 case EHOSTUNREACH:
1861                 case EHOSTDOWN:
1862                 case ENETUNREACH:
1863                 case ENETDOWN:
1864                 case ECONNREFUSED:
1865 #ifdef EPROTO
1866                 case EPROTO:
1867 #endif
1868 #ifdef ENONET
1869                 case ENONET:
1870 #endif
1871                         goto soft_error;
1872                 default:
1873                         break;
1874                 }
1875                 isc__strerror(errno, strbuf, sizeof(strbuf));
1876                 UNEXPECTED_ERROR(__FILE__, __LINE__,
1877                                  "internal_accept: %s() %s: %s", err,
1878                                  isc_msgcat_get(isc_msgcat,
1879                                                 ISC_MSGSET_GENERAL,
1880                                                 ISC_MSG_FAILED,
1881                                                 "failed"),
1882                                  strbuf);
1883                 fd = -1;
1884                 result = ISC_R_UNEXPECTED;
1885         } else {
1886                 if (addrlen == 0U) {
1887                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1888                                          "internal_accept(): "
1889                                          "accept() failed to return "
1890                                          "remote address");
1891
1892                         (void)close(fd);
1893                         goto soft_error;
1894                 } else if (dev->newsocket->address.type.sa.sa_family !=
1895                            sock->pf)
1896                 {
1897                         UNEXPECTED_ERROR(__FILE__, __LINE__,
1898                                          "internal_accept(): "
1899                                          "accept() returned peer address "
1900                                          "family %u (expected %u)", 
1901                                          dev->newsocket->address.
1902                                          type.sa.sa_family,
1903                                          sock->pf);
1904                         (void)close(fd);
1905                         goto soft_error;
1906                 } else if (fd >= (int)FD_SETSIZE) {
1907                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1908                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1909                                        isc_msgcat, ISC_MSGSET_SOCKET,
1910                                        ISC_MSG_TOOMANYFDS,
1911                                        "%s: too many open file descriptors",
1912                                        "accept");
1913                         (void)close(fd);
1914                         goto soft_error;
1915                 }
1916         }
1917
1918         if (fd != -1) {
1919                 dev->newsocket->address.length = addrlen;
1920                 dev->newsocket->pf = sock->pf;
1921         }
1922
1923         /*
1924          * Pull off the done event.
1925          */
1926         ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
1927
1928         /*
1929          * Poke watcher if there are more pending accepts.
1930          */
1931         if (!ISC_LIST_EMPTY(sock->accept_list))
1932                 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
1933
1934         UNLOCK(&sock->lock);
1935
1936         if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
1937                 (void)close(fd);
1938                 fd = -1;
1939                 result = ISC_R_UNEXPECTED;
1940         }
1941
1942         /*
1943          * -1 means the new socket didn't happen.
1944          */
1945         if (fd != -1) {
1946                 LOCK(&manager->lock);
1947                 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
1948
1949                 dev->newsocket->fd = fd;
1950                 dev->newsocket->bound = 1;
1951                 dev->newsocket->connected = 1;
1952
1953                 /*
1954                  * Save away the remote address
1955                  */
1956                 dev->address = dev->newsocket->address;
1957
1958                 manager->fds[fd] = dev->newsocket;
1959                 manager->fdstate[fd] = MANAGED;
1960                 if (manager->maxfd < fd)
1961                         manager->maxfd = fd;
1962
1963                 socket_log(sock, &dev->newsocket->address, CREATION,
1964                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
1965                            "accepted connection, new socket %p",
1966                            dev->newsocket);
1967
1968                 UNLOCK(&manager->lock);
1969         } else {
1970                 dev->newsocket->references--;
1971                 free_socket(&dev->newsocket);
1972         }
1973         
1974         /*
1975          * Fill in the done event details and send it off.
1976          */
1977         dev->result = result;
1978         task = dev->ev_sender;
1979         dev->ev_sender = sock;
1980
1981         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
1982         return;
1983
1984  soft_error:
1985         select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
1986         UNLOCK(&sock->lock);
1987         return;
1988 }
1989
1990 static void
1991 internal_recv(isc_task_t *me, isc_event_t *ev) {
1992         isc_socketevent_t *dev;
1993         isc_socket_t *sock;
1994
1995         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1996
1997         sock = ev->ev_sender;
1998         INSIST(VALID_SOCKET(sock));
1999
2000         LOCK(&sock->lock);
2001         socket_log(sock, NULL, IOEVENT,
2002                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2003                    "internal_recv: task %p got event %p", me, ev);
2004
2005         INSIST(sock->pending_recv == 1);
2006         sock->pending_recv = 0;
2007
2008         INSIST(sock->references > 0);
2009         sock->references--;  /* the internal event is done with this socket */
2010         if (sock->references == 0) {
2011                 UNLOCK(&sock->lock);
2012                 destroy(&sock);
2013                 return;
2014         }
2015
2016         /*
2017          * Try to do as much I/O as possible on this socket.  There are no
2018          * limits here, currently.
2019          */
2020         dev = ISC_LIST_HEAD(sock->recv_list);
2021         while (dev != NULL) {
2022                 switch (doio_recv(sock, dev)) {
2023                 case DOIO_SOFT:
2024                         goto poke;
2025
2026                 case DOIO_EOF:
2027                         /*
2028                          * read of 0 means the remote end was closed.
2029                          * Run through the event queue and dispatch all
2030                          * the events with an EOF result code.
2031                          */
2032                         do {
2033                                 dev->result = ISC_R_EOF;
2034                                 send_recvdone_event(sock, &dev);
2035                                 dev = ISC_LIST_HEAD(sock->recv_list);
2036                         } while (dev != NULL);
2037                         goto poke;
2038
2039                 case DOIO_SUCCESS:
2040                 case DOIO_HARD:
2041                         send_recvdone_event(sock, &dev);
2042                         break;
2043                 }
2044
2045                 dev = ISC_LIST_HEAD(sock->recv_list);
2046         }
2047
2048  poke:
2049         if (!ISC_LIST_EMPTY(sock->recv_list))
2050                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2051
2052         UNLOCK(&sock->lock);
2053 }
2054
2055 static void
2056 internal_send(isc_task_t *me, isc_event_t *ev) {
2057         isc_socketevent_t *dev;
2058         isc_socket_t *sock;
2059
2060         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2061
2062         /*
2063          * Find out what socket this is and lock it.
2064          */
2065         sock = (isc_socket_t *)ev->ev_sender;
2066         INSIST(VALID_SOCKET(sock));
2067
2068         LOCK(&sock->lock);
2069         socket_log(sock, NULL, IOEVENT,
2070                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2071                    "internal_send: task %p got event %p", me, ev);
2072
2073         INSIST(sock->pending_send == 1);
2074         sock->pending_send = 0;
2075
2076         INSIST(sock->references > 0);
2077         sock->references--;  /* the internal event is done with this socket */
2078         if (sock->references == 0) {
2079                 UNLOCK(&sock->lock);
2080                 destroy(&sock);
2081                 return;
2082         }
2083
2084         /*
2085          * Try to do as much I/O as possible on this socket.  There are no
2086          * limits here, currently.
2087          */
2088         dev = ISC_LIST_HEAD(sock->send_list);
2089         while (dev != NULL) {
2090                 switch (doio_send(sock, dev)) {
2091                 case DOIO_SOFT:
2092                         goto poke;
2093
2094                 case DOIO_HARD:
2095                 case DOIO_SUCCESS:
2096                         send_senddone_event(sock, &dev);
2097                         break;
2098                 }
2099
2100                 dev = ISC_LIST_HEAD(sock->send_list);
2101         }
2102
2103  poke:
2104         if (!ISC_LIST_EMPTY(sock->send_list))
2105                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2106
2107         UNLOCK(&sock->lock);
2108 }
2109
2110 static void
2111 process_fds(isc_socketmgr_t *manager, int maxfd,
2112             fd_set *readfds, fd_set *writefds)
2113 {
2114         int i;
2115         isc_socket_t *sock;
2116         isc_boolean_t unlock_sock;
2117
2118         REQUIRE(maxfd <= (int)FD_SETSIZE);
2119
2120         /*
2121          * Process read/writes on other fds here.  Avoid locking
2122          * and unlocking twice if both reads and writes are possible.
2123          */
2124         for (i = 0; i < maxfd; i++) {
2125 #ifdef ISC_PLATFORM_USETHREADS
2126                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
2127                         continue;
2128 #endif /* ISC_PLATFORM_USETHREADS */
2129
2130                 if (manager->fdstate[i] == CLOSE_PENDING) {
2131                         manager->fdstate[i] = CLOSED;
2132                         FD_CLR(i, &manager->read_fds);
2133                         FD_CLR(i, &manager->write_fds);
2134
2135                         (void)close(i);
2136
2137                         continue;
2138                 }
2139
2140                 sock = manager->fds[i];
2141                 unlock_sock = ISC_FALSE;
2142                 if (FD_ISSET(i, readfds)) {
2143                         if (sock == NULL) {
2144                                 FD_CLR(i, &manager->read_fds);
2145                                 goto check_write;
2146                         }
2147                         unlock_sock = ISC_TRUE;
2148                         LOCK(&sock->lock);
2149                         if (!SOCK_DEAD(sock)) {
2150                                 if (sock->listener)
2151                                         dispatch_accept(sock);
2152                                 else
2153                                         dispatch_recv(sock);
2154                         }
2155                         FD_CLR(i, &manager->read_fds);
2156                 }
2157         check_write:
2158                 if (FD_ISSET(i, writefds)) {
2159                         if (sock == NULL) {
2160                                 FD_CLR(i, &manager->write_fds);
2161                                 continue;
2162                         }
2163                         if (!unlock_sock) {
2164                                 unlock_sock = ISC_TRUE;
2165                                 LOCK(&sock->lock);
2166                         }
2167                         if (!SOCK_DEAD(sock)) {
2168                                 if (sock->connecting)
2169                                         dispatch_connect(sock);
2170                                 else
2171                                         dispatch_send(sock);
2172                         }
2173                         FD_CLR(i, &manager->write_fds);
2174                 }
2175                 if (unlock_sock)
2176                         UNLOCK(&sock->lock);
2177         }
2178 }
2179
#ifdef ISC_PLATFORM_USETHREADS
/*
 * Body of the socket manager's watcher thread.
 *
 * The thread loops, blocking in select() on a snapshot of the manager's
 * read and write descriptor sets.  The manager lock is held at all times
 * except while select() itself is running.  After each wakeup the thread
 * drains any messages from the control pipe (passing socket wakeups to
 * wakeup_socket() and noting a shutdown request) and then hands the
 * ready descriptors to process_fds().  It exits once a
 * SELECT_POKE_SHUTDOWN message has been read.
 */
static isc_threadresult_t
watcher(void *uap) {
        isc_socketmgr_t *manager = uap;
        isc_boolean_t shutting_down = ISC_FALSE;
        fd_set rset;
        fd_set wset;
        char errbuf[ISC_STRERRORSIZE];
        int pipe_rd;
        int nready;
        int nfds;
        int msg, fd;

        /*
         * The read end of the control pipe never changes, so fetch it once.
         */
        LOCK(&manager->lock);
        pipe_rd = manager->pipe_fds[0];

        while (!shutting_down) {
                /*
                 * Repeat select() until it returns without error.  Soft
                 * errors simply retry with a fresh snapshot of the sets.
                 */
                for (;;) {
                        rset = manager->read_fds;
                        wset = manager->write_fds;
                        nfds = manager->maxfd + 1;

                        UNLOCK(&manager->lock);

                        nready = select(nfds, &rset, &wset, NULL, NULL);
                        if (nready < 0 && !SOFT_ERROR(errno)) {
                                isc__strerror(errno, errbuf, sizeof(errbuf));
                                FATAL_ERROR(__FILE__, __LINE__,
                                            "select() %s: %s",
                                            isc_msgcat_get(isc_msgcat,
                                                    ISC_MSGSET_GENERAL,
                                                    ISC_MSG_FAILED,
                                                    "failed"),
                                            errbuf);
                        }

                        LOCK(&manager->lock);

                        if (nready >= 0)
                                break;
                }

                /*
                 * Drain messages from the internal control descriptor.
                 */
                if (FD_ISSET(pipe_rd, &rset)) {
                        for (;;) {
                                select_readmsg(manager, &fd, &msg);

                                manager_log(manager, IOEVENT,
                                            isc_msgcat_get(isc_msgcat,
                                                     ISC_MSGSET_SOCKET,
                                                     ISC_MSG_WATCHERMSG,
                                                     "watcher got message %d"),
                                                     msg);

                                /* Nothing left to read. */
                                if (msg == SELECT_POKE_NOTHING)
                                        break;

                                /*
                                 * Shutdown request.  The outer loop ends
                                 * after one more pass over the ready
                                 * descriptors, which is harmless.
                                 */
                                if (msg == SELECT_POKE_SHUTDOWN) {
                                        shutting_down = ISC_TRUE;
                                        break;
                                }

                                /*
                                 * A wakeup for a socket: let
                                 * wakeup_socket() decide whether the
                                 * descriptor needs to be watched now.
                                 */
                                wakeup_socket(manager, fd, msg);
                        }
                }

                process_fds(manager, nfds, &rset, &wset);
        }

        manager_log(manager, TRACE,
                    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
                                   ISC_MSG_EXITING, "watcher exiting"));

        UNLOCK(&manager->lock);
        return ((isc_threadresult_t)0);
}
#endif /* ISC_PLATFORM_USETHREADS */
2287
2288 /*
2289  * Create a new socket manager.
2290  */
2291 isc_result_t
2292 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2293         isc_socketmgr_t *manager;
2294 #ifdef ISC_PLATFORM_USETHREADS
2295         char strbuf[ISC_STRERRORSIZE];
2296 #endif
2297
2298         REQUIRE(managerp != NULL && *managerp == NULL);
2299
2300 #ifndef ISC_PLATFORM_USETHREADS
2301         if (socketmgr != NULL) {
2302                 socketmgr->refs++;
2303                 *managerp = socketmgr;
2304                 return (ISC_R_SUCCESS);
2305         }
2306 #endif /* ISC_PLATFORM_USETHREADS */
2307
2308         manager = isc_mem_get(mctx, sizeof(*manager));
2309         if (manager == NULL)
2310                 return (ISC_R_NOMEMORY);
2311
2312         manager->magic = SOCKET_MANAGER_MAGIC;
2313         manager->mctx = NULL;
2314         memset(manager->fds, 0, sizeof(manager->fds));
2315         ISC_LIST_INIT(manager->socklist);
2316         if (isc_mutex_init(&manager->lock) != ISC_R_SUCCESS) {
2317                 isc_mem_put(mctx, manager, sizeof(*manager));
2318                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2319                                  "isc_mutex_init() %s",
2320                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2321                                                 ISC_MSG_FAILED, "failed"));
2322                 return (ISC_R_UNEXPECTED);
2323         }
2324 #ifdef ISC_PLATFORM_USETHREADS
2325         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2326                 DESTROYLOCK(&manager->lock);
2327                 isc_mem_put(mctx, manager, sizeof(*manager));
2328                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2329                                  "isc_condition_init() %s",
2330                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2331                                                 ISC_MSG_FAILED, "failed"));
2332                 return (ISC_R_UNEXPECTED);
2333         }
2334
2335         /*
2336          * Create the special fds that will be used to wake up the
2337          * select/poll loop when something internal needs to be done.
2338          */
2339         if (pipe(manager->pipe_fds) != 0) {
2340                 DESTROYLOCK(&manager->lock);
2341                 isc_mem_put(mctx, manager, sizeof(*manager));
2342                 isc__strerror(errno, strbuf, sizeof(strbuf));
2343                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2344                                  "pipe() %s: %s",
2345                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2346                                                 ISC_MSG_FAILED, "failed"),
2347                                  strbuf);
2348
2349                 return (ISC_R_UNEXPECTED);
2350         }
2351
2352         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
2353 #if 0
2354         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
2355 #endif
2356 #else /* ISC_PLATFORM_USETHREADS */
2357         manager->refs = 1;
2358 #endif /* ISC_PLATFORM_USETHREADS */
2359
2360         /*
2361          * Set up initial state for the select loop
2362          */
2363         FD_ZERO(&manager->read_fds);
2364         FD_ZERO(&manager->write_fds);
2365 #ifdef ISC_PLATFORM_USETHREADS
2366         FD_SET(manager->pipe_fds[0], &manager->read_fds);
2367         manager->maxfd = manager->pipe_fds[0];
2368 #else /* ISC_PLATFORM_USETHREADS */
2369         manager->maxfd = 0;
2370 #endif /* ISC_PLATFORM_USETHREADS */
2371         memset(manager->fdstate, 0, sizeof(manager->fdstate));
2372
2373 #ifdef ISC_PLATFORM_USETHREADS
2374         /*
2375          * Start up the select/poll thread.
2376          */
2377         if (isc_thread_create(watcher, manager, &manager->watcher) !=
2378             ISC_R_SUCCESS) {
2379                 (void)close(manager->pipe_fds[0]);
2380                 (void)close(manager->pipe_fds[1]);
2381                 DESTROYLOCK(&manager->lock);
2382                 isc_mem_put(mctx, manager, sizeof(*manager));
2383                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2384                                  "isc_thread_create() %s",
2385                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2386                                                 ISC_MSG_FAILED, "failed"));
2387                 return (ISC_R_UNEXPECTED);
2388         }
2389 #endif /* ISC_PLATFORM_USETHREADS */
2390         isc_mem_attach(mctx, &manager->mctx);
2391
2392 #ifndef ISC_PLATFORM_USETHREADS
2393         socketmgr = manager;
2394 #endif /* ISC_PLATFORM_USETHREADS */
2395         *managerp = manager;
2396
2397         return (ISC_R_SUCCESS);
2398 }
2399
2400 void
2401 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
2402         isc_socketmgr_t *manager;
2403         int i;
2404         isc_mem_t *mctx;
2405
2406         /*
2407          * Destroy a socket manager.
2408          */
2409
2410         REQUIRE(managerp != NULL);
2411         manager = *managerp;
2412         REQUIRE(VALID_MANAGER(manager));
2413
2414 #ifndef ISC_PLATFORM_USETHREADS
2415         if (manager->refs > 1) {
2416                 manager->refs--;
2417                 *managerp = NULL;
2418                 return;
2419         }
2420 #endif /* ISC_PLATFORM_USETHREADS */
2421
2422         LOCK(&manager->lock);
2423
2424 #ifdef ISC_PLATFORM_USETHREADS
2425         /*
2426          * Wait for all sockets to be destroyed.
2427          */
2428         while (!ISC_LIST_EMPTY(manager->socklist)) {
2429                 manager_log(manager, CREATION,
2430                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2431                                            ISC_MSG_SOCKETSREMAIN,
2432                                            "sockets exist"));
2433                 WAIT(&manager->shutdown_ok, &manager->lock);
2434         }
2435 #else /* ISC_PLATFORM_USETHREADS */
2436         /*
2437          * Hope all sockets have been destroyed.
2438          */
2439         if (!ISC_LIST_EMPTY(manager->socklist)) {
2440                 manager_log(manager, CREATION,
2441                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2442                                            ISC_MSG_SOCKETSREMAIN,
2443                                            "sockets exist"));
2444                 INSIST(0);
2445         }
2446 #endif /* ISC_PLATFORM_USETHREADS */
2447
2448         UNLOCK(&manager->lock);
2449
2450         /*
2451          * Here, poke our select/poll thread.  Do this by closing the write
2452          * half of the pipe, which will send EOF to the read half.
2453          * This is currently a no-op in the non-threaded case.
2454          */
2455         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2456
2457 #ifdef ISC_PLATFORM_USETHREADS
2458         /*
2459          * Wait for thread to exit.
2460          */
2461         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
2462                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2463                                  "isc_thread_join() %s",
2464                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2465                                                 ISC_MSG_FAILED, "failed"));
2466 #endif /* ISC_PLATFORM_USETHREADS */
2467
2468         /*
2469          * Clean up.
2470          */
2471 #ifdef ISC_PLATFORM_USETHREADS
2472         (void)close(manager->pipe_fds[0]);
2473         (void)close(manager->pipe_fds[1]);
2474         (void)isc_condition_destroy(&manager->shutdown_ok);
2475 #endif /* ISC_PLATFORM_USETHREADS */
2476
2477         for (i = 0; i < (int)FD_SETSIZE; i++)
2478                 if (manager->fdstate[i] == CLOSE_PENDING)
2479                         (void)close(i);
2480
2481         DESTROYLOCK(&manager->lock);
2482         manager->magic = 0;
2483         mctx= manager->mctx;
2484         isc_mem_put(mctx, manager, sizeof(*manager));
2485
2486         isc_mem_detach(&mctx);
2487
2488         *managerp = NULL;
2489 }
2490
2491 static isc_result_t
2492 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2493             unsigned int flags)
2494 {
2495         int io_state;
2496         isc_boolean_t have_lock = ISC_FALSE;
2497         isc_task_t *ntask = NULL;
2498         isc_result_t result = ISC_R_SUCCESS;
2499
2500         dev->ev_sender = task;
2501
2502         if (sock->type == isc_sockettype_udp) {
2503                 io_state = doio_recv(sock, dev);
2504         } else {
2505                 LOCK(&sock->lock);
2506                 have_lock = ISC_TRUE;
2507
2508                 if (ISC_LIST_EMPTY(sock->recv_list))
2509                         io_state = doio_recv(sock, dev);
2510                 else
2511                         io_state = DOIO_SOFT;
2512         }
2513
2514         switch (io_state) {
2515         case DOIO_SOFT:
2516                 /*
2517                  * We couldn't read all or part of the request right now, so
2518                  * queue it.
2519                  *
2520                  * Attach to socket and to task
2521                  */
2522                 isc_task_attach(task, &ntask);
2523                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2524
2525                 if (!have_lock) {
2526                         LOCK(&sock->lock);
2527                         have_lock = ISC_TRUE;
2528                 }
2529
2530                 /*
2531                  * Enqueue the request.  If the socket was previously not being
2532                  * watched, poke the watcher to start paying attention to it.
2533                  */
2534                 if (ISC_LIST_EMPTY(sock->recv_list))
2535                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2536                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2537
2538                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2539                            "socket_recv: event %p -> task %p",
2540                            dev, ntask);
2541
2542                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2543                         result = ISC_R_INPROGRESS;
2544                 break;
2545
2546         case DOIO_EOF:
2547                 dev->result = ISC_R_EOF;
2548                 /* fallthrough */
2549
2550         case DOIO_HARD:
2551         case DOIO_SUCCESS:
2552                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2553                         send_recvdone_event(sock, &dev);
2554                 break;
2555         }
2556
2557         if (have_lock)
2558                 UNLOCK(&sock->lock);
2559
2560         return (result);
2561 }
2562
2563 isc_result_t
2564 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2565                  unsigned int minimum, isc_task_t *task,
2566                  isc_taskaction_t action, const void *arg)
2567 {
2568         isc_socketevent_t *dev;
2569         isc_socketmgr_t *manager;
2570         unsigned int iocount;
2571         isc_buffer_t *buffer;
2572
2573         REQUIRE(VALID_SOCKET(sock));
2574         REQUIRE(buflist != NULL);
2575         REQUIRE(!ISC_LIST_EMPTY(*buflist));
2576         REQUIRE(task != NULL);
2577         REQUIRE(action != NULL);
2578
2579         manager = sock->manager;
2580         REQUIRE(VALID_MANAGER(manager));
2581
2582         iocount = isc_bufferlist_availablecount(buflist);
2583         REQUIRE(iocount > 0);
2584
2585         INSIST(sock->bound);
2586
2587         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2588         if (dev == NULL) {
2589                 return (ISC_R_NOMEMORY);
2590         }
2591
2592         /*
2593          * UDP sockets are always partial read
2594          */
2595         if (sock->type == isc_sockettype_udp)
2596                 dev->minimum = 1;
2597         else {
2598                 if (minimum == 0)
2599                         dev->minimum = iocount;
2600                 else
2601                         dev->minimum = minimum;
2602         }
2603
2604         /*
2605          * Move each buffer from the passed in list to our internal one.
2606          */
2607         buffer = ISC_LIST_HEAD(*buflist);
2608         while (buffer != NULL) {
2609                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2610                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2611                 buffer = ISC_LIST_HEAD(*buflist);
2612         }
2613
2614         return (socket_recv(sock, dev, task, 0));
2615 }
2616
2617 isc_result_t
2618 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
2619                 isc_task_t *task, isc_taskaction_t action, const void *arg)
2620 {
2621         isc_socketevent_t *dev;
2622         isc_socketmgr_t *manager;
2623
2624         REQUIRE(VALID_SOCKET(sock));
2625         REQUIRE(action != NULL);
2626
2627         manager = sock->manager;
2628         REQUIRE(VALID_MANAGER(manager));
2629
2630         INSIST(sock->bound);
2631
2632         dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2633         if (dev == NULL)
2634                 return (ISC_R_NOMEMORY);
2635
2636         return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
2637 }
2638
2639 isc_result_t
2640 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
2641                  unsigned int minimum, isc_task_t *task,
2642                  isc_socketevent_t *event, unsigned int flags)
2643 {
2644         event->ev_sender = sock;
2645         event->result = ISC_R_UNEXPECTED;
2646         ISC_LIST_INIT(event->bufferlist);
2647         event->region = *region;
2648         event->n = 0;
2649         event->offset = 0;
2650         event->attributes = 0;
2651
2652         /*
2653          * UDP sockets are always partial read.
2654          */
2655         if (sock->type == isc_sockettype_udp)
2656                 event->minimum = 1;
2657         else {
2658                 if (minimum == 0)
2659                         event->minimum = region->length;
2660                 else
2661                         event->minimum = minimum;
2662         }
2663
2664         return (socket_recv(sock, event, task, flags));
2665 }
2666
2667 static isc_result_t
2668 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2669             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2670             unsigned int flags)
2671 {
2672         int io_state;
2673         isc_boolean_t have_lock = ISC_FALSE;
2674         isc_task_t *ntask = NULL;
2675         isc_result_t result = ISC_R_SUCCESS;
2676
2677         dev->ev_sender = task;
2678
2679         set_dev_address(address, sock, dev);
2680         if (pktinfo != NULL) {
2681                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2682                 dev->pktinfo = *pktinfo;
2683
2684                 if (!isc_sockaddr_issitelocal(address) &&
2685                     !isc_sockaddr_islinklocal(address)) {
2686                         socket_log(sock, NULL, TRACE, isc_msgcat,
2687                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
2688                                    "pktinfo structure provided, ifindex %u "
2689                                    "(set to 0)", pktinfo->ipi6_ifindex);
2690
2691                         /*
2692                          * Set the pktinfo index to 0 here, to let the
2693                          * kernel decide what interface it should send on.
2694                          */
2695                         dev->pktinfo.ipi6_ifindex = 0;
2696                 }
2697         }
2698
2699         if (sock->type == isc_sockettype_udp)
2700                 io_state = doio_send(sock, dev);
2701         else {
2702                 LOCK(&sock->lock);
2703                 have_lock = ISC_TRUE;
2704
2705                 if (ISC_LIST_EMPTY(sock->send_list))
2706                         io_state = doio_send(sock, dev);
2707                 else
2708                         io_state = DOIO_SOFT;
2709         }
2710
2711         switch (io_state) {
2712         case DOIO_SOFT:
2713                 /*
2714                  * We couldn't send all or part of the request right now, so
2715                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
2716                  */
2717                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2718                         isc_task_attach(task, &ntask);
2719                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2720
2721                         if (!have_lock) {
2722                                 LOCK(&sock->lock);
2723                                 have_lock = ISC_TRUE;
2724                         }
2725
2726                         /*
2727                          * Enqueue the request.  If the socket was previously
2728                          * not being watched, poke the watcher to start
2729                          * paying attention to it.
2730                          */
2731                         if (ISC_LIST_EMPTY(sock->send_list))
2732                                 select_poke(sock->manager, sock->fd,
2733                                             SELECT_POKE_WRITE);
2734                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2735
2736                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
2737                                    "socket_send: event %p -> task %p",
2738                                    dev, ntask);
2739
2740                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2741                                 result = ISC_R_INPROGRESS;
2742                         break;
2743                 }
2744
2745         case DOIO_HARD:
2746         case DOIO_SUCCESS:
2747                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2748                         send_senddone_event(sock, &dev);
2749                 break;
2750         }
2751
2752         if (have_lock)
2753                 UNLOCK(&sock->lock);
2754
2755         return (result);
2756 }
2757
2758 isc_result_t
2759 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
2760                 isc_task_t *task, isc_taskaction_t action, const void *arg)
2761 {
2762         /*
2763          * REQUIRE() checking is performed in isc_socket_sendto().
2764          */
2765         return (isc_socket_sendto(sock, region, task, action, arg, NULL,
2766                                   NULL));
2767 }
2768
2769 isc_result_t
2770 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
2771                   isc_task_t *task, isc_taskaction_t action, const void *arg,
2772                   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2773 {
2774         isc_socketevent_t *dev;
2775         isc_socketmgr_t *manager;
2776
2777         REQUIRE(VALID_SOCKET(sock));
2778         REQUIRE(region != NULL);
2779         REQUIRE(task != NULL);
2780         REQUIRE(action != NULL);
2781
2782         manager = sock->manager;
2783         REQUIRE(VALID_MANAGER(manager));
2784
2785         INSIST(sock->bound);
2786
2787         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2788         if (dev == NULL) {
2789                 return (ISC_R_NOMEMORY);
2790         }
2791
2792         dev->region = *region;
2793
2794         return (socket_send(sock, dev, task, address, pktinfo, 0));
2795 }
2796
2797 isc_result_t
2798 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2799                  isc_task_t *task, isc_taskaction_t action, const void *arg)
2800 {
2801         return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
2802                                    NULL));
2803 }
2804
2805 isc_result_t
2806 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
2807                    isc_task_t *task, isc_taskaction_t action, const void *arg,
2808                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2809 {
2810         isc_socketevent_t *dev;
2811         isc_socketmgr_t *manager;
2812         unsigned int iocount;
2813         isc_buffer_t *buffer;
2814
2815         REQUIRE(VALID_SOCKET(sock));
2816         REQUIRE(buflist != NULL);
2817         REQUIRE(!ISC_LIST_EMPTY(*buflist));
2818         REQUIRE(task != NULL);
2819         REQUIRE(action != NULL);
2820
2821         manager = sock->manager;
2822         REQUIRE(VALID_MANAGER(manager));
2823
2824         iocount = isc_bufferlist_usedcount(buflist);
2825         REQUIRE(iocount > 0);
2826
2827         dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2828         if (dev == NULL) {
2829                 return (ISC_R_NOMEMORY);
2830         }
2831
2832         /*
2833          * Move each buffer from the passed in list to our internal one.
2834          */
2835         buffer = ISC_LIST_HEAD(*buflist);
2836         while (buffer != NULL) {
2837                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2838                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2839                 buffer = ISC_LIST_HEAD(*buflist);
2840         }
2841
2842         return (socket_send(sock, dev, task, address, pktinfo, 0));
2843 }
2844
2845 isc_result_t
2846 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
2847                    isc_task_t *task,
2848                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2849                    isc_socketevent_t *event, unsigned int flags)
2850 {
2851         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
2852         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
2853                 REQUIRE(sock->type == isc_sockettype_udp);
2854         event->ev_sender = sock;
2855         event->result = ISC_R_UNEXPECTED;
2856         ISC_LIST_INIT(event->bufferlist);
2857         event->region = *region;
2858         event->n = 0;
2859         event->offset = 0;
2860         event->attributes = 0;
2861
2862         return (socket_send(sock, event, task, address, pktinfo, flags));
2863 }
2864
2865 isc_result_t
2866 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr) {
2867         char strbuf[ISC_STRERRORSIZE];
2868         int on = 1;
2869
2870         LOCK(&sock->lock);
2871
2872         INSIST(!sock->bound);
2873
2874         if (sock->pf != sockaddr->type.sa.sa_family) {
2875                 UNLOCK(&sock->lock);
2876                 return (ISC_R_FAMILYMISMATCH);
2877         }
2878         /*
2879          * Only set SO_REUSEADDR when we want a specific port.
2880          */
2881         if (isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2882             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2883                        sizeof(on)) < 0) {
2884                 UNEXPECTED_ERROR(__FILE__, __LINE__,
2885                                  "setsockopt(%d) %s", sock->fd,
2886                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2887                                                 ISC_MSG_FAILED, "failed"));
2888                 /* Press on... */
2889         }
2890         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2891                 UNLOCK(&sock->lock);
2892                 switch (errno) {
2893                 case EACCES:
2894                         return (ISC_R_NOPERM);
2895                 case EADDRNOTAVAIL:
2896                         return (ISC_R_ADDRNOTAVAIL);
2897                 case EADDRINUSE:
2898                         return (ISC_R_ADDRINUSE);
2899                 case EINVAL:
2900                         return (ISC_R_BOUND);
2901                 default:
2902                         isc__strerror(errno, strbuf, sizeof(strbuf));
2903                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2904                                          strbuf);
2905                         return (ISC_R_UNEXPECTED);
2906                 }
2907         }
2908
2909         socket_log(sock, sockaddr, TRACE,
2910                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
2911         sock->bound = 1;
2912
2913         UNLOCK(&sock->lock);
2914         return (ISC_R_SUCCESS);
2915 }
2916
2917 isc_result_t
2918 isc_socket_filter(isc_socket_t *sock, const char *filter) {
2919 #ifdef SO_ACCEPTFILTER
2920         char strbuf[ISC_STRERRORSIZE];
2921         struct accept_filter_arg afa;
2922 #else
2923         UNUSED(sock);
2924         UNUSED(filter);
2925 #endif
2926
2927         REQUIRE(VALID_SOCKET(sock));
2928
2929 #ifdef SO_ACCEPTFILTER
2930         bzero(&afa, sizeof(afa));
2931         strncpy(afa.af_name, filter, sizeof(afa.af_name));
2932         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
2933                          &afa, sizeof(afa)) == -1) {
2934                 isc__strerror(errno, strbuf, sizeof(strbuf));
2935                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2936                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
2937                            strbuf);
2938                 return (ISC_R_FAILURE);
2939         }
2940         return (ISC_R_SUCCESS);
2941 #else
2942         return (ISC_R_NOTIMPLEMENTED);
2943 #endif
2944 }
2945
2946 /*
2947  * Set up to listen on a given socket.  We do this by creating an internal
2948  * event that will be dispatched when the socket has read activity.  The
2949  * watcher will send the internal event to the task when there is a new
2950  * connection.
2951  *
2952  * Unlike in read, we don't preallocate a done event here.  Every time there
2953  * is a new connection we'll have to allocate a new one anyway, so we might
2954  * as well keep things simple rather than having to track them.
2955  */
2956 isc_result_t
2957 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
2958         char strbuf[ISC_STRERRORSIZE];
2959
2960         REQUIRE(VALID_SOCKET(sock));
2961
2962         LOCK(&sock->lock);
2963
2964         REQUIRE(!sock->listener);
2965         REQUIRE(sock->bound);
2966         REQUIRE(sock->type == isc_sockettype_tcp);
2967
2968         if (backlog == 0)
2969                 backlog = SOMAXCONN;
2970
2971         if (listen(sock->fd, (int)backlog) < 0) {
2972                 UNLOCK(&sock->lock);
2973                 isc__strerror(errno, strbuf, sizeof(strbuf));
2974
2975                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
2976
2977                 return (ISC_R_UNEXPECTED);
2978         }
2979
2980         sock->listener = 1;
2981
2982         UNLOCK(&sock->lock);
2983         return (ISC_R_SUCCESS);
2984 }
2985
2986 /*
2987  * This should try to do agressive accept() XXXMLG
2988  */
2989 isc_result_t
2990 isc_socket_accept(isc_socket_t *sock,
2991                   isc_task_t *task, isc_taskaction_t action, const void *arg)
2992 {
2993         isc_socket_newconnev_t *dev;
2994         isc_socketmgr_t *manager;
2995         isc_task_t *ntask = NULL;
2996         isc_socket_t *nsock;
2997         isc_result_t ret;
2998         isc_boolean_t do_poke = ISC_FALSE;
2999
3000         REQUIRE(VALID_SOCKET(sock));
3001         manager = sock->manager;
3002         REQUIRE(VALID_MANAGER(manager));
3003
3004         LOCK(&sock->lock);
3005
3006         REQUIRE(sock->listener);
3007
3008         /*
3009          * Sender field is overloaded here with the task we will be sending
3010          * this event to.  Just before the actual event is delivered the
3011          * actual ev_sender will be touched up to be the socket.
3012          */
3013         dev = (isc_socket_newconnev_t *)
3014                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3015                                    action, arg, sizeof(*dev));
3016         if (dev == NULL) {
3017                 UNLOCK(&sock->lock);
3018                 return (ISC_R_NOMEMORY);
3019         }
3020         ISC_LINK_INIT(dev, ev_link);
3021
3022         ret = allocate_socket(manager, sock->type, &nsock);
3023         if (ret != ISC_R_SUCCESS) {
3024                 isc_event_free(ISC_EVENT_PTR(&dev));
3025                 UNLOCK(&sock->lock);
3026                 return (ret);
3027         }
3028
3029         /*
3030          * Attach to socket and to task.
3031          */
3032         isc_task_attach(task, &ntask);
3033         nsock->references++;
3034
3035         dev->ev_sender = ntask;
3036         dev->newsocket = nsock;
3037
3038         /*
3039          * Poke watcher here.  We still have the socket locked, so there
3040          * is no race condition.  We will keep the lock for such a short
3041          * bit of time waking it up now or later won't matter all that much.
3042          */
3043         if (ISC_LIST_EMPTY(sock->accept_list))
3044                 do_poke = ISC_TRUE;
3045
3046         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
3047
3048         if (do_poke)
3049                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
3050
3051         UNLOCK(&sock->lock);
3052         return (ISC_R_SUCCESS);
3053 }
3054
3055 isc_result_t
3056 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3057                    isc_task_t *task, isc_taskaction_t action, const void *arg)
3058 {
3059         isc_socket_connev_t *dev;
3060         isc_task_t *ntask = NULL;
3061         isc_socketmgr_t *manager;
3062         int cc;
3063         char strbuf[ISC_STRERRORSIZE];
3064
3065         REQUIRE(VALID_SOCKET(sock));
3066         REQUIRE(addr != NULL);
3067         REQUIRE(task != NULL);
3068         REQUIRE(action != NULL);
3069
3070         manager = sock->manager;
3071         REQUIRE(VALID_MANAGER(manager));
3072         REQUIRE(addr != NULL);
3073
3074         if (isc_sockaddr_ismulticast(addr))
3075                 return (ISC_R_MULTICAST);
3076
3077         LOCK(&sock->lock);
3078
3079         REQUIRE(!sock->connecting);
3080
3081         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3082                                                         ISC_SOCKEVENT_CONNECT,
3083                                                         action, arg,
3084                                                         sizeof(*dev));
3085         if (dev == NULL) {
3086                 UNLOCK(&sock->lock);
3087                 return (ISC_R_NOMEMORY);
3088         }
3089         ISC_LINK_INIT(dev, ev_link);
3090
3091         /*
3092          * Try to do the connect right away, as there can be only one
3093          * outstanding, and it might happen to complete.
3094          */
3095         sock->address = *addr;
3096         cc = connect(sock->fd, &addr->type.sa, addr->length);
3097         if (cc < 0) {
3098                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
3099                         goto queue;
3100
3101                 switch (errno) {
3102 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
3103                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
3104                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3105                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3106                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3107                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3108 #ifdef EHOSTDOWN
3109                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3110 #endif
3111                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3112                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3113                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3114                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3115                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3116 #undef ERROR_MATCH
3117                 }
3118
3119                 sock->connected = 0;
3120
3121                 isc__strerror(errno, strbuf, sizeof(strbuf));
3122                 UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
3123
3124                 UNLOCK(&sock->lock);
3125                 isc_event_free(ISC_EVENT_PTR(&dev));
3126                 return (ISC_R_UNEXPECTED);
3127
3128         err_exit:
3129                 sock->connected = 0;
3130                 isc_task_send(task, ISC_EVENT_PTR(&dev));
3131
3132                 UNLOCK(&sock->lock);
3133                 return (ISC_R_SUCCESS);
3134         }
3135
3136         /*
3137          * If connect completed, fire off the done event.
3138          */
3139         if (cc == 0) {
3140                 sock->connected = 1;
3141                 sock->bound = 1;
3142                 dev->result = ISC_R_SUCCESS;
3143                 isc_task_send(task, ISC_EVENT_PTR(&dev));
3144
3145                 UNLOCK(&sock->lock);
3146                 return (ISC_R_SUCCESS);
3147         }
3148
3149  queue:
3150
3151         /*
3152          * Attach to task.
3153          */
3154         isc_task_attach(task, &ntask);
3155
3156         sock->connecting = 1;
3157
3158         dev->ev_sender = ntask;
3159
3160         /*
3161          * Poke watcher here.  We still have the socket locked, so there
3162          * is no race condition.  We will keep the lock for such a short
3163          * bit of time waking it up now or later won't matter all that much.
3164          */
3165         if (sock->connect_ev == NULL)
3166                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
3167
3168         sock->connect_ev = dev;
3169
3170         UNLOCK(&sock->lock);
3171         return (ISC_R_SUCCESS);
3172 }
3173
3174 /*
3175  * Called when a socket with a pending connect() finishes.
3176  */
3177 static void
3178 internal_connect(isc_task_t *me, isc_event_t *ev) {
3179         isc_socket_t *sock;
3180         isc_socket_connev_t *dev;
3181         isc_task_t *task;
3182         int cc;
3183         ISC_SOCKADDR_LEN_T optlen;
3184         char strbuf[ISC_STRERRORSIZE];
3185         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3186
3187         UNUSED(me);
3188         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3189
3190         sock = ev->ev_sender;
3191         INSIST(VALID_SOCKET(sock));
3192
3193         LOCK(&sock->lock);
3194
3195         /*
3196          * When the internal event was sent the reference count was bumped
3197          * to keep the socket around for us.  Decrement the count here.
3198          */
3199         INSIST(sock->references > 0);
3200         sock->references--;
3201         if (sock->references == 0) {
3202                 UNLOCK(&sock->lock);
3203                 destroy(&sock);
3204                 return;
3205         }
3206
3207         /*
3208          * Has this event been canceled?
3209          */
3210         dev = sock->connect_ev;
3211         if (dev == NULL) {
3212                 INSIST(!sock->connecting);
3213                 UNLOCK(&sock->lock);
3214                 return;
3215         }
3216
3217         INSIST(sock->connecting);
3218         sock->connecting = 0;
3219
3220         /*
3221          * Get any possible error status here.
3222          */
3223         optlen = sizeof(cc);
3224         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
3225                        (void *)&cc, (void *)&optlen) < 0)
3226                 cc = errno;
3227         else
3228                 errno = cc;
3229
3230         if (errno != 0) {
3231                 /*
3232                  * If the error is EAGAIN, just re-select on this
3233                  * fd and pretend nothing strange happened.
3234                  */
3235                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
3236                         sock->connecting = 1;
3237                         select_poke(sock->manager, sock->fd,
3238                                     SELECT_POKE_CONNECT);
3239                         UNLOCK(&sock->lock);
3240
3241                         return;
3242                 }
3243
3244                 /*
3245                  * Translate other errors into ISC_R_* flavors.
3246                  */
3247                 switch (errno) {
3248 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
3249                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
3250                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3251                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3252                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3253                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3254 #ifdef EHOSTDOWN
3255                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3256 #endif
3257                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3258                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3259                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3260                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3261                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
3262                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
3263 #undef ERROR_MATCH
3264                 default:
3265                         dev->result = ISC_R_UNEXPECTED;
3266                         isc_sockaddr_format(&sock->address, peerbuf,
3267                                             sizeof(peerbuf));
3268                         isc__strerror(errno, strbuf, sizeof(strbuf));
3269                         UNEXPECTED_ERROR(__FILE__, __LINE__,
3270                                          "internal_connect: connect(%s) %s",
3271                                          peerbuf, strbuf);
3272                 }
3273         } else {
3274                 dev->result = ISC_R_SUCCESS;
3275                 sock->connected = 1;
3276                 sock->bound = 1;
3277         }
3278
3279         sock->connect_ev = NULL;
3280
3281         UNLOCK(&sock->lock);
3282
3283         task = dev->ev_sender;
3284         dev->ev_sender = sock;
3285         isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3286 }
3287
3288 isc_result_t
3289 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3290         isc_result_t ret;
3291
3292         REQUIRE(VALID_SOCKET(sock));
3293         REQUIRE(addressp != NULL);
3294
3295         LOCK(&sock->lock);
3296
3297         if (sock->connected) {
3298                 *addressp = sock->address;
3299                 ret = ISC_R_SUCCESS;
3300         } else {
3301                 ret = ISC_R_NOTCONNECTED;
3302         }
3303
3304         UNLOCK(&sock->lock);
3305
3306         return (ret);
3307 }
3308
3309 isc_result_t
3310 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3311         ISC_SOCKADDR_LEN_T len;
3312         isc_result_t ret;
3313         char strbuf[ISC_STRERRORSIZE];
3314
3315         REQUIRE(VALID_SOCKET(sock));
3316         REQUIRE(addressp != NULL);
3317
3318         LOCK(&sock->lock);
3319
3320         if (!sock->bound) {
3321                 ret = ISC_R_NOTBOUND;
3322                 goto out;
3323         }
3324
3325         ret = ISC_R_SUCCESS;
3326
3327         len = sizeof(addressp->type);
3328         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3329                 isc__strerror(errno, strbuf, sizeof(strbuf));
3330                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3331                                  strbuf);
3332                 ret = ISC_R_UNEXPECTED;
3333                 goto out;
3334         }
3335         addressp->length = (unsigned int)len;
3336
3337  out:
3338         UNLOCK(&sock->lock);
3339
3340         return (ret);
3341 }
3342
3343 /*
3344  * Run through the list of events on this socket, and cancel the ones
3345  * queued for task "task" of type "how".  "how" is a bitmask.
3346  */
3347 void
3348 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3349
3350         REQUIRE(VALID_SOCKET(sock));
3351
3352         /*
3353          * Quick exit if there is nothing to do.  Don't even bother locking
3354          * in this case.
3355          */
3356         if (how == 0)
3357                 return;
3358
3359         LOCK(&sock->lock);
3360
3361         /*
3362          * All of these do the same thing, more or less.
3363          * Each will:
3364          *      o If the internal event is marked as "posted" try to
3365          *        remove it from the task's queue.  If this fails, mark it
3366          *        as canceled instead, and let the task clean it up later.
3367          *      o For each I/O request for that task of that type, post
3368          *        its done event with status of "ISC_R_CANCELED".
3369          *      o Reset any state needed.
3370          */
3371         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
3372             && !ISC_LIST_EMPTY(sock->recv_list)) {
3373                 isc_socketevent_t      *dev;
3374                 isc_socketevent_t      *next;
3375                 isc_task_t             *current_task;
3376
3377                 dev = ISC_LIST_HEAD(sock->recv_list);
3378
3379                 while (dev != NULL) {
3380                         current_task = dev->ev_sender;
3381                         next = ISC_LIST_NEXT(dev, ev_link);
3382
3383                         if ((task == NULL) || (task == current_task)) {
3384                                 dev->result = ISC_R_CANCELED;
3385                                 send_recvdone_event(sock, &dev);
3386                         }
3387                         dev = next;
3388                 }
3389         }
3390
3391         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
3392             && !ISC_LIST_EMPTY(sock->send_list)) {
3393                 isc_socketevent_t      *dev;
3394                 isc_socketevent_t      *next;
3395                 isc_task_t             *current_task;
3396
3397                 dev = ISC_LIST_HEAD(sock->send_list);
3398
3399                 while (dev != NULL) {
3400                         current_task = dev->ev_sender;
3401                         next = ISC_LIST_NEXT(dev, ev_link);
3402
3403                         if ((task == NULL) || (task == current_task)) {
3404                                 dev->result = ISC_R_CANCELED;
3405                                 send_senddone_event(sock, &dev);
3406                         }
3407                         dev = next;
3408                 }
3409         }
3410
3411         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3412             && !ISC_LIST_EMPTY(sock->accept_list)) {
3413                 isc_socket_newconnev_t *dev;
3414                 isc_socket_newconnev_t *next;
3415                 isc_task_t             *current_task;
3416
3417                 dev = ISC_LIST_HEAD(sock->accept_list);
3418                 while (dev != NULL) {
3419                         current_task = dev->ev_sender;
3420                         next = ISC_LIST_NEXT(dev, ev_link);
3421
3422                         if ((task == NULL) || (task == current_task)) {
3423
3424                                 ISC_LIST_UNLINK(sock->accept_list, dev,
3425                                                 ev_link);
3426
3427                                 dev->newsocket->references--;
3428                                 free_socket(&dev->newsocket);
3429
3430                                 dev->result = ISC_R_CANCELED;
3431                                 dev->ev_sender = sock;
3432                                 isc_task_sendanddetach(&current_task,
3433                                                        ISC_EVENT_PTR(&dev));
3434                         }
3435
3436                         dev = next;
3437                 }
3438         }
3439
3440         /*
3441          * Connecting is not a list.
3442          */
3443         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3444             && sock->connect_ev != NULL) {
3445                 isc_socket_connev_t    *dev;
3446                 isc_task_t             *current_task;
3447
3448                 INSIST(sock->connecting);
3449                 sock->connecting = 0;
3450
3451                 dev = sock->connect_ev;
3452                 current_task = dev->ev_sender;
3453
3454                 if ((task == NULL) || (task == current_task)) {
3455                         sock->connect_ev = NULL;
3456
3457                         dev->result = ISC_R_CANCELED;
3458                         dev->ev_sender = sock;
3459                         isc_task_sendanddetach(&current_task,
3460                                                ISC_EVENT_PTR(&dev));
3461                 }
3462         }
3463
3464         UNLOCK(&sock->lock);
3465 }
3466
3467 isc_sockettype_t
3468 isc_socket_gettype(isc_socket_t *sock) {
3469         REQUIRE(VALID_SOCKET(sock));
3470
3471         return (sock->type);
3472 }
3473
3474 isc_boolean_t
3475 isc_socket_isbound(isc_socket_t *sock) {
3476         isc_boolean_t val;
3477
3478         LOCK(&sock->lock);
3479         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3480         UNLOCK(&sock->lock);
3481
3482         return (val);
3483 }
3484
3485 void
3486 isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3487 #if defined(IPV6_V6ONLY)
3488         int onoff = yes ? 1 : 0;
3489 #else
3490         UNUSED(yes);
3491         UNUSED(sock);
3492 #endif
3493
3494         REQUIRE(VALID_SOCKET(sock));
3495
3496 #ifdef IPV6_V6ONLY
3497         if (sock->pf == AF_INET6) {
3498                 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3499                                  (void *)&onoff, sizeof(onoff));
3500         }
3501 #endif
3502 }
3503
3504 #ifndef ISC_PLATFORM_USETHREADS
3505 void
3506 isc__socketmgr_getfdsets(fd_set *readset, fd_set *writeset, int *maxfd) {
3507         if (socketmgr == NULL)
3508                 *maxfd = 0;
3509         else {
3510                 *readset = socketmgr->read_fds;
3511                 *writeset = socketmgr->write_fds;
3512                 *maxfd = socketmgr->maxfd + 1;
3513         }
3514 }
3515
3516 isc_result_t
3517 isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) {
3518         isc_socketmgr_t *manager = socketmgr;
3519
3520         if (manager == NULL)
3521                 return (ISC_R_NOTFOUND);
3522
3523         process_fds(manager, maxfd, readset, writeset);
3524         return (ISC_R_SUCCESS);
3525 }
3526 #endif /* ISC_PLATFORM_USETHREADS */