kernel - Add callout debugging
[dragonfly.git] / sys / kern / kern_dmsg.c
1 /*-
2  * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * TODO: txcmd CREATE state is deferred by tx msgq, need to calculate
36  *       a streaming response.  See subr_diskiocom()'s diskiodone().
37  */
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/sysctl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/priv.h>
53 #include <sys/thread.h>
54 #include <sys/globaldata.h>
55 #include <sys/limits.h>
56
57 #include <sys/dmsg.h>
58
59 RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
60
61 SYSCTL_NODE(, OID_AUTO, kdmsg, CTLFLAG_RW, 0, "kdmsg");
62 static int kdmsg_debug = 1;
63 SYSCTL_INT(_kdmsg, OID_AUTO, debug, CTLFLAG_RW, &kdmsg_debug, 0,
64            "Set debug level for kernel dmsg layer");
65
/*
 * Debug output helpers.  Both emit a "kdmsg: "-prefixed message through
 * kprintf() when kdmsg_debug is at or above 'level'.  kdio_printf() also
 * takes an iocom argument, which is currently neither used nor evaluated.
 * 'ctl' must be a string literal and at least one argument must follow it.
 *
 * The expansion is wrapped in do { } while (0) so that every invocation
 * is exactly one statement.  A bare 'if' expansion would capture the
 * 'else' of an un-braced if/else surrounding the call site.
 */
#define kd_printf(level, ctl, ...)				\
	do {							\
		if (kdmsg_debug >= (level))			\
			kprintf("kdmsg: " ctl, __VA_ARGS__);	\
	} while (0)

#define kdio_printf(iocom, level, ctl, ...)			\
	do {							\
		if (kdmsg_debug >= (level))			\
			kprintf("kdmsg: " ctl, __VA_ARGS__);	\
	} while (0)
71
72 static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
73 static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
74 static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
75 static void kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg);
76 static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
77 static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
78 static void kdmsg_subq_delete(kdmsg_state_t *state);
79 static void kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error);
80 static void kdmsg_state_abort(kdmsg_state_t *state);
81 static void kdmsg_state_dying(kdmsg_state_t *state);
82 static void kdmsg_state_free(kdmsg_state_t *state);
83 static void kdmsg_drain_msg(kdmsg_msg_t *msg);
84
85 #ifdef KDMSG_DEBUG
86 #define KDMSG_DEBUG_ARGS        , const char *file, int line
87 #define kdmsg_state_hold(state) _kdmsg_state_hold(state, __FILE__, __LINE__)
88 #define kdmsg_state_drop(state) _kdmsg_state_drop(state, __FILE__, __LINE__)
89 #else
90 #define KDMSG_DEBUG 0
91 #define KDMSG_DEBUG_ARGS
92 #define kdmsg_state_hold(state) _kdmsg_state_hold(state)
93 #define kdmsg_state_drop(state) _kdmsg_state_drop(state)
94 #endif
95 static void _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
96 static void _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
97
98 static void kdmsg_iocom_thread_rd(void *arg);
99 static void kdmsg_iocom_thread_wr(void *arg);
100 static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
101
102 /*static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);*/
103
104 /*
105  * Initialize the roll-up communications structure for a network
106  * messaging session.  This function does not install the socket.
107  */
108 void
109 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
110                  struct malloc_type *mmsg,
111                  int (*rcvmsg)(kdmsg_msg_t *msg))
112 {
113         bzero(iocom, sizeof(*iocom));
114         iocom->handle = handle;
115         iocom->mmsg = mmsg;
116         iocom->rcvmsg = rcvmsg;
117         iocom->flags = flags;
118         lockinit(&iocom->msglk, "h2msg", 0, 0);
119         TAILQ_INIT(&iocom->msgq);
120         RB_INIT(&iocom->staterd_tree);
121         RB_INIT(&iocom->statewr_tree);
122
123         iocom->state0.iocom = iocom;
124         iocom->state0.parent = &iocom->state0;
125         TAILQ_INIT(&iocom->state0.subq);
126 }
127
128 /*
129  * [Re]connect using the passed file pointer.  The caller must ref the
130  * fp for us.  We own that ref now.
131  */
132 void
133 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
134                       const char *subsysname)
135 {
136         /*
137          * Destroy the current connection
138          */
139         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
140         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
141         while (iocom->msgrd_td || iocom->msgwr_td) {
142                 wakeup(&iocom->msg_ctl);
143                 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
144         }
145
146         /*
147          * Drop communications descriptor
148          */
149         if (iocom->msg_fp) {
150                 fdrop(iocom->msg_fp);
151                 iocom->msg_fp = NULL;
152         }
153
154         /*
155          * Setup new communications descriptor
156          */
157         iocom->msg_ctl = 0;
158         iocom->msg_fp = fp;
159         iocom->msg_seq = 0;
160         iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
161
162         lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
163                     NULL, 0, -1, "%s-msgrd", subsysname);
164         lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
165                     NULL, 0, -1, "%s-msgwr", subsysname);
166         lockmgr(&iocom->msglk, LK_RELEASE);
167 }
168
169 /*
170  * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
171  * this function to handle the state machine for LNK_CONN and LNK_SPAN.
172  */
173 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
174 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
175
176 void
177 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
178                          void (*auto_callback)(kdmsg_msg_t *msg))
179 {
180         kdmsg_msg_t *msg;
181
182         iocom->auto_callback = auto_callback;
183
184         msg = kdmsg_msg_alloc(&iocom->state0,
185                               DMSG_LNK_CONN | DMSGF_CREATE,
186                               kdmsg_lnk_conn_reply, NULL);
187         iocom->auto_lnk_conn.head = msg->any.head;
188         msg->any.lnk_conn = iocom->auto_lnk_conn;
189         iocom->conn_state = msg->state;
190         kdmsg_state_hold(msg->state);   /* iocom->conn_state */
191         kdmsg_msg_write(msg);
192 }
193
194 static
195 int
196 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
197 {
198         kdmsg_iocom_t *iocom = state->iocom;
199         kdmsg_msg_t *rmsg;
200
201         /*
202          * Upon receipt of the LNK_CONN acknowledgement initiate an
203          * automatic SPAN if we were asked to.  Used by e.g. xdisk, but
204          * not used by HAMMER2 which must manage more than one transmitted
205          * SPAN.
206          */
207         if ((msg->any.head.cmd & DMSGF_CREATE) &&
208             (iocom->flags & KDMSG_IOCOMF_AUTOTXSPAN)) {
209                 rmsg = kdmsg_msg_alloc(&iocom->state0,
210                                        DMSG_LNK_SPAN | DMSGF_CREATE,
211                                        kdmsg_lnk_span_reply, NULL);
212                 iocom->auto_lnk_span.head = rmsg->any.head;
213                 rmsg->any.lnk_span = iocom->auto_lnk_span;
214                 kdmsg_msg_write(rmsg);
215         }
216
217         /*
218          * Process shim after the CONN is acknowledged and before the CONN
219          * transaction is deleted.  For deletions this gives device drivers
220          * the ability to interlock new operations on the circuit before
221          * it becomes illegal and panics.
222          */
223         if (iocom->auto_callback)
224                 iocom->auto_callback(msg);
225
226         if ((state->txcmd & DMSGF_DELETE) == 0 &&
227             (msg->any.head.cmd & DMSGF_DELETE)) {
228                 /*
229                  * iocom->conn_state has a state ref, drop it when clearing.
230                  */
231                 if (iocom->conn_state)
232                         kdmsg_state_drop(iocom->conn_state);
233                 iocom->conn_state = NULL;
234                 kdmsg_msg_reply(msg, 0);
235         }
236
237         return (0);
238 }
239
240 static
241 int
242 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
243 {
244         /*
245          * Be sure to process shim before terminating the SPAN
246          * transaction.  Gives device drivers the ability to
247          * interlock new operations on the circuit before it
248          * becomes illegal and panics.
249          */
250         if (state->iocom->auto_callback)
251                 state->iocom->auto_callback(msg);
252
253         if ((state->txcmd & DMSGF_DELETE) == 0 &&
254             (msg->any.head.cmd & DMSGF_DELETE)) {
255                 kdmsg_msg_reply(msg, 0);
256         }
257         return (0);
258 }
259
260 /*
261  * Disconnect and clean up
262  */
263 void
264 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
265 {
266         kdmsg_state_t *state;
267         kdmsg_msg_t *msg;
268         int retries;
269
270         /*
271          * Ask the cluster controller to go away by setting
272          * KILLRX.  Send a PING to get a response to unstick reading
273          * from the pipe.
274          *
275          * After 10 seconds shitcan the pipe and do an unclean shutdown.
276          */
277         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
278
279         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
280         msg = kdmsg_msg_alloc(&iocom->state0, DMSG_LNK_PING, NULL, NULL);
281         kdmsg_msg_write_locked(iocom, msg);
282
283         retries = 10;
284         while (iocom->msgrd_td || iocom->msgwr_td) {
285                 wakeup(&iocom->msg_ctl);
286                 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
287                 if (--retries == 0 && iocom->msg_fp) {
288                         kdio_printf(iocom, 0, "%s\n",
289                                     "iocom_uninit: "
290                                     "shitcanning unresponsive pipe");
291                         fp_shutdown(iocom->msg_fp, SHUT_RDWR);
292                         /* retries allowed to go negative, keep looping */
293                 }
294         }
295
296         /*
297          * Cleanup caches
298          */
299         if ((state = iocom->freerd_state) != NULL) {
300                 iocom->freerd_state = NULL;
301                 kdmsg_state_drop(state);
302         }
303
304         if ((state = iocom->freewr_state) != NULL) {
305                 iocom->freewr_state = NULL;
306                 kdmsg_state_drop(state);
307         }
308
309         /*
310          * Drop communications descriptor
311          */
312         if (iocom->msg_fp) {
313                 fdrop(iocom->msg_fp);
314                 iocom->msg_fp = NULL;
315         }
316         lockmgr(&iocom->msglk, LK_RELEASE);
317 }
318
319 /*
320  * Cluster controller thread.  Perform messaging functions.  We have one
321  * thread for the reader and one for the writer.  The writer handles
322  * shutdown requests (which should break the reader thread).
323  */
324 static
325 void
326 kdmsg_iocom_thread_rd(void *arg)
327 {
328         kdmsg_iocom_t *iocom = arg;
329         dmsg_hdr_t hdr;
330         kdmsg_msg_t *msg = NULL;
331         size_t hbytes;
332         size_t abytes;
333         int error = 0;
334
335         while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLRX) == 0) {
336                 /*
337                  * Retrieve the message from the pipe or socket.
338                  */
339                 error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
340                                 NULL, 1, UIO_SYSSPACE);
341                 if (error)
342                         break;
343                 if (hdr.magic != DMSG_HDR_MAGIC) {
344                         kdio_printf(iocom, 1, "bad magic: %04x\n", hdr.magic);
345                         error = EINVAL;
346                         break;
347                 }
348                 hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
349                 if (hbytes < sizeof(hdr) || hbytes > DMSG_HDR_MAX) {
350                         kdio_printf(iocom, 1, "bad header size %zd\n", hbytes);
351                         error = EINVAL;
352                         break;
353                 }
354
355                 /* XXX messy: mask cmd to avoid allocating state */
356                 msg = kdmsg_msg_alloc(&iocom->state0,
357                                       hdr.cmd & DMSGF_BASECMDMASK,
358                                       NULL, NULL);
359                 msg->any.head = hdr;
360                 msg->hdr_size = hbytes;
361                 if (hbytes > sizeof(hdr)) {
362                         error = fp_read(iocom->msg_fp, &msg->any.head + 1,
363                                         hbytes - sizeof(hdr),
364                                         NULL, 1, UIO_SYSSPACE);
365                         if (error) {
366                                 kdio_printf(iocom, 1, "%s\n",
367                                             "short msg received");
368                                 error = EINVAL;
369                                 break;
370                         }
371                 }
372                 msg->aux_size = hdr.aux_bytes;
373                 if (msg->aux_size > DMSG_AUX_MAX) {
374                         kdio_printf(iocom, 1,
375                                     "illegal msg payload size %zd\n",
376                                     msg->aux_size);
377                         error = EINVAL;
378                         break;
379                 }
380                 if (msg->aux_size) {
381                         abytes = DMSG_DOALIGN(msg->aux_size);
382                         msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
383                         msg->flags |= KDMSG_FLAG_AUXALLOC;
384                         error = fp_read(iocom->msg_fp, msg->aux_data,
385                                         abytes, NULL, 1, UIO_SYSSPACE);
386                         if (error) {
387                                 kdio_printf(iocom, 1, "%s\n",
388                                             "short msg payload received");
389                                 break;
390                         }
391                 }
392
393                 error = kdmsg_msg_receive_handling(msg);
394                 msg = NULL;
395         }
396
397 #if 0
398         kdio_printf(iocom, 1, "read thread terminating error=%d\n", error);
399 #endif
400
401         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
402         if (msg)
403                 kdmsg_msg_free(msg);
404
405         /*
406          * Shutdown the socket and set KILLRX for consistency in case the
407          * shutdown was not commanded.  Signal the transmit side to shutdown
408          * by setting KILLTX and waking it up.
409          */
410         fp_shutdown(iocom->msg_fp, SHUT_RDWR);
411         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
412                                         KDMSG_CLUSTERCTL_KILLTX);
413         iocom->msgrd_td = NULL;
414         lockmgr(&iocom->msglk, LK_RELEASE);
415         wakeup(&iocom->msg_ctl);
416
417         /*
418          * iocom can be ripped out at any time once the lock is
419          * released with msgrd_td set to NULL.  The wakeup()s are safe but
420          * that is all.
421          */
422         wakeup(iocom);
423         lwkt_exit();
424 }
425
426 static
427 void
428 kdmsg_iocom_thread_wr(void *arg)
429 {
430         kdmsg_iocom_t *iocom = arg;
431         kdmsg_msg_t *msg;
432         ssize_t res;
433         size_t abytes;
434         int error = 0;
435         int save_ticks;
436         int didwarn;
437
438         /*
439          * Transmit loop
440          */
441         msg = NULL;
442         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
443
444         while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0 && error == 0) {
445                 /*
446                  * Sleep if no messages pending.  Interlock with flag while
447                  * holding msglk.
448                  */
449                 if (TAILQ_EMPTY(&iocom->msgq)) {
450                         atomic_set_int(&iocom->msg_ctl,
451                                        KDMSG_CLUSTERCTL_SLEEPING);
452                         lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
453                         atomic_clear_int(&iocom->msg_ctl,
454                                          KDMSG_CLUSTERCTL_SLEEPING);
455                 }
456
457                 while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
458                         /*
459                          * Remove msg from the transmit queue and do
460                          * persist and half-closed state handling.
461                          */
462                         TAILQ_REMOVE(&iocom->msgq, msg, qentry);
463
464                         error = kdmsg_state_msgtx(msg);
465                         if (error == EALREADY) {
466                                 error = 0;
467                                 kdmsg_msg_free(msg);
468                                 continue;
469                         }
470                         if (error) {
471                                 kdmsg_msg_free(msg);
472                                 break;
473                         }
474
475                         /*
476                          * Dump the message to the pipe or socket.
477                          *
478                          * We have to clean up the message as if the transmit
479                          * succeeded even if it failed.
480                          */
481                         lockmgr(&iocom->msglk, LK_RELEASE);
482                         error = fp_write(iocom->msg_fp, &msg->any,
483                                          msg->hdr_size, &res, UIO_SYSSPACE);
484                         if (error || res != msg->hdr_size) {
485                                 if (error == 0)
486                                         error = EINVAL;
487                                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
488                                 kdmsg_state_cleanuptx(msg);
489                                 break;
490                         }
491                         if (msg->aux_size) {
492                                 abytes = DMSG_DOALIGN(msg->aux_size);
493                                 error = fp_write(iocom->msg_fp,
494                                                  msg->aux_data, abytes,
495                                                  &res, UIO_SYSSPACE);
496                                 if (error || res != abytes) {
497                                         if (error == 0)
498                                                 error = EINVAL;
499                                         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
500                                         kdmsg_state_cleanuptx(msg);
501                                         break;
502                                 }
503                         }
504                         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
505                         kdmsg_state_cleanuptx(msg);
506                 }
507         }
508
509 #if 0
510         kdio_printf(iocom, 1, "write thread terminating error=%d\n", error);
511 #endif
512
513         /*
514          * Shutdown the socket and set KILLTX for consistency in case the
515          * shutdown was not commanded.  Signal the receive side to shutdown
516          * by setting KILLRX and waking it up.
517          */
518         fp_shutdown(iocom->msg_fp, SHUT_RDWR);
519         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
520                                         KDMSG_CLUSTERCTL_KILLTX);
521         wakeup(&iocom->msg_ctl);
522
523         /*
524          * The transmit thread is responsible for final cleanups, wait
525          * for the receive side to terminate to prevent new received
526          * states from interfering with our cleanup.
527          *
528          * Do not set msgwr_td to NULL until we actually exit.
529          */
530         while (iocom->msgrd_td) {
531                 wakeup(&iocom->msg_ctl);
532                 lksleep(iocom, &iocom->msglk, 0, "clstrkt", hz);
533         }
534
535         /*
536          * We can no longer receive new messages.  We must drain the transmit
537          * message queue and simulate received messages to close anay remaining
538          * states.
539          *
540          * Loop until all the states are gone and there are no messages
541          * pending transmit.
542          */
543         save_ticks = ticks;
544         didwarn = 0;
545         iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
546
547         while (TAILQ_FIRST(&iocom->msgq) ||
548                RB_ROOT(&iocom->staterd_tree) ||
549                RB_ROOT(&iocom->statewr_tree) ||
550                iocom->conn_state) {
551                 /*
552                  * Simulate failure for all sub-states of state0.
553                  */
554                 kdmsg_drain_msgq(iocom);
555                 kdmsg_simulate_failure(&iocom->state0, 0, DMSG_ERR_LOSTLINK);
556
557                 lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz / 2);
558
559                 if ((int)(ticks - save_ticks) > hz*2 && didwarn == 0) {
560                         didwarn = 1;
561                         kdio_printf(iocom, 0,
562                                     "Warning, write thread on %p "
563                                     "still terminating\n",
564                                     iocom);
565                 }
566                 if ((int)(ticks - save_ticks) > hz*15 && didwarn == 1) {
567                         didwarn = 2;
568                         kdio_printf(iocom, 0,
569                                     "Warning, write thread on %p "
570                                     "still terminating\n",
571                                     iocom);
572                 }
573                 if ((int)(ticks - save_ticks) > hz*60) {
574                         kdio_printf(iocom, 0,
575                                     "Can't terminate: msgq %p "
576                                     "rd_tree %p wr_tree %p\n",
577                                     TAILQ_FIRST(&iocom->msgq),
578                                     RB_ROOT(&iocom->staterd_tree),
579                                     RB_ROOT(&iocom->statewr_tree));
580                         lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz * 10);
581                 }
582         }
583
584         /*
585          * Exit handling is done by the write thread.
586          */
587         lockmgr(&iocom->msglk, LK_RELEASE);
588
589         /*
590          * The state trees had better be empty now
591          */
592         KKASSERT(RB_EMPTY(&iocom->staterd_tree));
593         KKASSERT(RB_EMPTY(&iocom->statewr_tree));
594         KKASSERT(iocom->conn_state == NULL);
595
596         if (iocom->exit_func) {
597                 /*
598                  * iocom is invalid after we call the exit function.
599                  */
600                 iocom->msgwr_td = NULL;
601                 iocom->exit_func(iocom);
602         } else {
603                 /*
604                  * iocom can be ripped out from under us once msgwr_td is
605                  * set to NULL.  The wakeup is safe.
606                  */
607                 iocom->msgwr_td = NULL;
608                 wakeup(iocom);
609         }
610         lwkt_exit();
611 }
612
613 /*
614  * This cleans out the pending transmit message queue, adjusting any
615  * persistent states properly in the process.
616  *
617  * Called with iocom locked.
618  */
619 void
620 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
621 {
622         kdmsg_msg_t *msg;
623
624         /*
625          * Clean out our pending transmit queue, executing the
626          * appropriate state adjustments as if the messages were
627          * sent.
628          */
629         while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
630                 TAILQ_REMOVE(&iocom->msgq, msg, qentry);
631                 kdmsg_drain_msg(msg);
632         }
633 }
634
635 /*
636  * Drain one message by simulating transmission and also simulating a
637  * receive failure.
638  */
639 static void
640 kdmsg_drain_msg(kdmsg_msg_t *msg)
641 {
642         if (kdmsg_state_msgtx(msg)) {
643                 kdmsg_msg_free(msg);
644         } else {
645                 if (msg->state) {
646                         kdmsg_simulate_failure(msg->state,
647                                                0, DMSG_ERR_LOSTLINK);
648                 }
649                 kdmsg_state_cleanuptx(msg);
650         }
651 }
652
653 /*
654  * Do all processing required to handle a freshly received message
655  * after its low level header has been validated.
656  *
657  * iocom is not locked.
658  */
659 static
660 int
661 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
662 {
663         kdmsg_iocom_t *iocom = msg->state->iocom;
664         int error;
665
666         /*
667          * State machine tracking, state assignment for msg,
668          * returns error and discard status.  Errors are fatal
669          * to the connection except for EALREADY which forces
670          * a discard without execution.
671          */
672         error = kdmsg_state_msgrx(msg);
673         if (msg->state->flags & KDMSG_STATE_ABORTING) {
674                 kdio_printf(iocom, 5,
675                             "kdmsg_state_abort(b): state %p rxcmd=%08x "
676                             "txcmd=%08x msgrx error %d\n",
677                             msg->state, msg->state->rxcmd,
678                             msg->state->txcmd, error);
679         }
680         if (error) {
681                 /*
682                  * Raw protocol or connection error
683                  */
684                 if (msg->state->flags & KDMSG_STATE_ABORTING)
685                         kdio_printf(iocom, 5,
686                                     "X1 state %p error %d\n",
687                                     msg->state, error);
688                 kdmsg_msg_free(msg);
689                 if (error == EALREADY)
690                         error = 0;
691         } else if (msg->state && msg->state->func) {
692                 /*
693                  * Message related to state which already has a
694                  * handling function installed for it.
695                  */
696                 if (msg->state->flags & KDMSG_STATE_ABORTING)
697                         kdio_printf(iocom, 5,
698                                     "X2 state %p func %p\n",
699                                     msg->state, msg->state->func);
700                 error = msg->state->func(msg->state, msg);
701                 kdmsg_state_cleanuprx(msg);
702         } else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
703                 if (msg->state->flags & KDMSG_STATE_ABORTING)
704                         kdio_printf(iocom, 5,
705                                     "X3 state %p\n", msg->state);
706                 error = kdmsg_autorxmsg(msg);
707                 kdmsg_state_cleanuprx(msg);
708         } else {
709                 if (msg->state->flags & KDMSG_STATE_ABORTING)
710                         kdio_printf(iocom, 5,
711                                     "X4 state %p\n", msg->state);
712                 error = iocom->rcvmsg(msg);
713                 kdmsg_state_cleanuprx(msg);
714         }
715         return error;
716 }
717
718 /*
719  * Process state tracking for a message after reception and dequeueing,
720  * prior to execution of the state callback.  The state is updated and
721  * will be removed from the RBTREE if completely closed, but the state->parent
722  * and subq linkage is not cleaned up until after the callback (see
723  * cleanuprx()).
724  *
725  * msglk is not held.
726  *
727  * NOTE: A message transaction can consist of several messages in either
728  *       direction.
729  *
730  * NOTE: The msgid is unique to the initiator, not necessarily unique for
731  *       us or for any relay or for the return direction for that matter.
732  *       That is, two sides sending a new message can use the same msgid
733  *       without colliding.
734  *
735  * --
736  *
737  * ABORT sequences work by setting the ABORT flag along with normal message
738  * state.  However, ABORTs can also be sent on half-closed messages, that is
739  * even if the command or reply side has already sent a DELETE, as long as
740  * the message has not been fully closed it can still send an ABORT+DELETE
741  * to terminate the half-closed message state.
742  *
743  * Since ABORT+DELETEs can race we silently discard ABORT's for message
744  * state which has already been fully closed.  REPLY+ABORT+DELETEs can
745  * also race, and in this situation the other side might have already
746  * initiated a new unrelated command with the same message id.  Since
747  * the abort has not set the CREATE flag the situation can be detected
748   * and the message will also be discarded.
749  *
750  * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
751  * The ABORT request is essentially integrated into the command instead
752  * of being sent later on.  In this situation the command implementation
753  * detects that CREATE and ABORT are both set (vs ABORT alone) and can
754  * special-case non-blocking operation for the command.
755  *
756  * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
757  *        to be mid-stream aborts for command/reply sequences.  ABORTs on
758  *        one-way messages are not supported.
759  *
760  * NOTE!  If a command sequence does not support aborts the ABORT flag is
761  *        simply ignored.
762  *
763  * --
764  *
765  * One-off messages (no reply expected) are sent with neither CREATE or DELETE
766  * set.  One-off messages cannot be aborted and typically aren't processed
767  * by these routines.  The REPLY bit can be used to distinguish whether a
768  * one-off message is a command or reply.  For example, one-off replies
769  * will typically just contain status updates.
770  */
771 static
772 int
773 kdmsg_state_msgrx(kdmsg_msg_t *msg)
774 {
775         kdmsg_iocom_t *iocom = msg->state->iocom;
776         kdmsg_state_t *state;
777         kdmsg_state_t *pstate;
778         kdmsg_state_t sdummy;
779         int error;
780
781         bzero(&sdummy, sizeof(sdummy)); /* avoid gcc warnings */
782
783         /*
784          * Make sure a state structure is ready to go in case we need a new
785          * one.  This is the only routine which uses freerd_state so no
786          * races are possible.
787          */
788         if ((state = iocom->freerd_state) == NULL) {
789                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
790                 state->flags = KDMSG_STATE_DYNAMIC;
791                 state->iocom = iocom;
792                 state->refs = 1;
793                 TAILQ_INIT(&state->subq);
794                 iocom->freerd_state = state;
795         }
796         state = NULL;   /* safety */
797
798         /*
799          * Lock RB tree and locate existing persistent state, if any.
800          *
801          * If received msg is a command state is on staterd_tree.
802          * If received msg is a reply state is on statewr_tree.
803          */
804         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
805
806 again:
807         if (msg->state == &iocom->state0) {
808                 sdummy.msgid = msg->any.head.msgid;
809                 sdummy.iocom = iocom;
810                 if (msg->any.head.cmd & DMSGF_REVTRANS) {
811                         state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree,
812                                         &sdummy);
813                 } else {
814                         state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree,
815                                         &sdummy);
816                 }
817
818                 /*
819                  * Set message state unconditionally.  If this is a CREATE
820                  * message this state will become the parent state and new
821                  * state will be allocated for the message state.
822                  */
823                 if (state == NULL)
824                         state = &iocom->state0;
825                 if (state->flags & KDMSG_STATE_INTERLOCK) {
826                         state->flags |= KDMSG_STATE_SIGNAL;
827                         lksleep(state, &iocom->msglk, 0, "dmrace", hz);
828                         goto again;
829                 }
830                 kdmsg_state_hold(state);
831                 kdmsg_state_drop(msg->state);   /* iocom->state0 */
832                 msg->state = state;
833         } else {
834                 state = msg->state;
835         }
836
837         /*
838          * Short-cut one-off or mid-stream messages.
839          */
840         if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
841                                   DMSGF_ABORT)) == 0) {
842                 error = 0;
843                 goto done;
844         }
845
846         /*
847          * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
848          * inside the case statements.
849          */
850         switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
851         case DMSGF_CREATE:
852         case DMSGF_CREATE | DMSGF_DELETE:
853                 /*
854                  * New persistant command received.
855                  */
856                 if (state != &iocom->state0) {
857                         kdio_printf(iocom, 1, "%s\n",
858                                     "duplicate transaction");
859                         error = EINVAL;
860                         break;
861                 }
862
863                 /*
864                  * Lookup the circuit.  The circuit is an open transaction.
865                  * the REVCIRC bit in the message tells us which side
866                  * initiated the transaction representing the circuit.
867                  */
868                 if (msg->any.head.circuit) {
869                         sdummy.msgid = msg->any.head.circuit;
870
871                         if (msg->any.head.cmd & DMSGF_REVCIRC) {
872                                 pstate = RB_FIND(kdmsg_state_tree,
873                                                  &iocom->statewr_tree,
874                                                  &sdummy);
875                         } else {
876                                 pstate = RB_FIND(kdmsg_state_tree,
877                                                  &iocom->staterd_tree,
878                                                  &sdummy);
879                         }
880                         if (pstate == NULL) {
881                                 kdio_printf(iocom, 1, "%s\n",
882                                             "missing parent in "
883                                             "stacked trans");
884                                 error = EINVAL;
885                                 break;
886                         }
887                 } else {
888                         pstate = &iocom->state0;
889                 }
890
891                 /*
892                  * Allocate new state.
893                  *
894                  * msg->state becomes the owner of the ref we inherit from
895                  * freerd_stae.
896                  */
897                 kdmsg_state_drop(state);
898                 state = iocom->freerd_state;
899                 iocom->freerd_state = NULL;
900
901                 msg->state = state;             /* inherits freerd ref */
902                 state->parent = pstate;
903                 KKASSERT(state->iocom == iocom);
904                 state->flags |= KDMSG_STATE_RBINSERTED |
905                                 KDMSG_STATE_SUBINSERTED |
906                                 KDMSG_STATE_OPPOSITE;
907                 if (TAILQ_EMPTY(&pstate->subq))
908                         kdmsg_state_hold(pstate);/* states on pstate->subq */
909                 kdmsg_state_hold(state);        /* state on pstate->subq */
910                 kdmsg_state_hold(state);        /* state on rbtree */
911                 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
912                 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
913                 state->txcmd = DMSGF_REPLY;
914                 state->msgid = msg->any.head.msgid;
915                 state->flags &= ~KDMSG_STATE_NEW;
916                 RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
917                 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
918                 error = 0;
919                 break;
920         case DMSGF_DELETE:
921                 /*
922                  * Persistent state is expected but might not exist if an
923                  * ABORT+DELETE races the close.
924                  */
925                 if (state == &iocom->state0) {
926                         if (msg->any.head.cmd & DMSGF_ABORT) {
927                                 kdio_printf(iocom, 1, "%s\n",
928                                             "msgrx: "
929                                             "state already A");
930                                 error = EALREADY;
931                         } else {
932                                 kdio_printf(iocom, 1, "%s\n",
933                                             "msgrx: no state for DELETE");
934                                 error = EINVAL;
935                         }
936                         break;
937                 }
938
939                 /*
940                  * Handle another ABORT+DELETE case if the msgid has already
941                  * been reused.
942                  */
943                 if ((state->rxcmd & DMSGF_CREATE) == 0) {
944                         if (msg->any.head.cmd & DMSGF_ABORT) {
945                                 kdio_printf(iocom, 1, "%s\n",
946                                             "msgrx: state already B");
947                                 error = EALREADY;
948                         } else {
949                                 kdio_printf(iocom, 1, "%s\n",
950                                             "msgrx: state reused for DELETE");
951                                 error = EINVAL;
952                         }
953                         break;
954                 }
955                 error = 0;
956                 break;
957         default:
958                 /*
959                  * Check for mid-stream ABORT command received, otherwise
960                  * allow.
961                  */
962                 if (msg->any.head.cmd & DMSGF_ABORT) {
963                         if (state == &iocom->state0 ||
964                             (state->rxcmd & DMSGF_CREATE) == 0) {
965                                 error = EALREADY;
966                                 break;
967                         }
968                 }
969                 error = 0;
970                 break;
971         case DMSGF_REPLY | DMSGF_CREATE:
972         case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
973                 /*
974                  * When receiving a reply with CREATE set the original
975                  * persistent state message should already exist.
976                  */
977                 if (state == &iocom->state0) {
978                         kdio_printf(iocom, 1,
979                                     "msgrx: no state match for "
980                                     "REPLY cmd=%08x msgid=%016jx\n",
981                                     msg->any.head.cmd,
982                                     (intmax_t)msg->any.head.msgid);
983                         error = EINVAL;
984                         break;
985                 }
986                 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
987                 error = 0;
988                 break;
989         case DMSGF_REPLY | DMSGF_DELETE:
990                 /*
991                  * Received REPLY+ABORT+DELETE in case where msgid has
992                  * already been fully closed, ignore the message.
993                  */
994                 if (state == &iocom->state0) {
995                         if (msg->any.head.cmd & DMSGF_ABORT) {
996                                 error = EALREADY;
997                         } else {
998                                 kdio_printf(iocom, 1, "%s\n",
999                                             "msgrx: no state match "
1000                                             "for REPLY|DELETE");
1001                                 error = EINVAL;
1002                         }
1003                         break;
1004                 }
1005
1006                 /*
1007                  * Received REPLY+ABORT+DELETE in case where msgid has
1008                  * already been reused for an unrelated message,
1009                  * ignore the message.
1010                  */
1011                 if ((state->rxcmd & DMSGF_CREATE) == 0) {
1012                         if (msg->any.head.cmd & DMSGF_ABORT) {
1013                                 error = EALREADY;
1014                         } else {
1015                                 kdio_printf(iocom, 1, "%s\n",
1016                                             "msgrx: state reused "
1017                                             "for REPLY|DELETE");
1018                                 error = EINVAL;
1019                         }
1020                         break;
1021                 }
1022                 error = 0;
1023                 break;
1024         case DMSGF_REPLY:
1025                 /*
1026                  * Check for mid-stream ABORT reply received to sent command.
1027                  */
1028                 if (msg->any.head.cmd & DMSGF_ABORT) {
1029                         if (state == &iocom->state0 ||
1030                             (state->rxcmd & DMSGF_CREATE) == 0) {
1031                                 error = EALREADY;
1032                                 break;
1033                         }
1034                 }
1035                 error = 0;
1036                 break;
1037         }
1038
1039         /*
1040          * Calculate the easy-switch() transactional command.  Represents
1041          * the outer-transaction command for any transaction-create or
1042          * transaction-delete, and the inner message command for any
1043          * non-transaction or inside-transaction command.  tcmd will be
1044          * set to 0 if the message state is illegal.
1045          *
1046          * The two can be told apart because outer-transaction commands
1047          * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
1048          */
1049 done:
1050         if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
1051                 if (state != &iocom->state0) {
1052                         msg->tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1053                                     (msg->any.head.cmd & (DMSGF_CREATE |
1054                                                           DMSGF_DELETE |
1055                                                           DMSGF_REPLY));
1056                 } else {
1057                         msg->tcmd = 0;
1058                 }
1059         } else {
1060                 msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
1061         }
1062
1063         /*
1064          * Adjust the state for DELETE handling now, before making the
1065          * callback so we are atomic with other state updates.
1066          *
1067          * Subq/parent linkages are cleaned up after the callback.
1068          * If an error occurred the message is ignored and state is not
1069          * updated.
1070          */
1071         if ((state = msg->state) == NULL || error != 0) {
1072                 kdio_printf(iocom, 1,
1073                             "msgrx: state=%p error %d\n",
1074                             state, error);
1075         } else if (msg->any.head.cmd & DMSGF_DELETE) {
1076                 KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1077                 state->rxcmd |= DMSGF_DELETE;
1078                 if (state->txcmd & DMSGF_DELETE) {
1079                         KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1080                         if (state->rxcmd & DMSGF_REPLY) {
1081                                 KKASSERT(msg->any.head.cmd &
1082                                          DMSGF_REPLY);
1083                                 RB_REMOVE(kdmsg_state_tree,
1084                                           &iocom->statewr_tree, state);
1085                         } else {
1086                                 KKASSERT((msg->any.head.cmd &
1087                                           DMSGF_REPLY) == 0);
1088                                 RB_REMOVE(kdmsg_state_tree,
1089                                           &iocom->staterd_tree, state);
1090                         }
1091                         state->flags &= ~KDMSG_STATE_RBINSERTED;
1092                         kdmsg_state_drop(state);        /* state on rbtree */
1093                 }
1094         }
1095         lockmgr(&iocom->msglk, LK_RELEASE);
1096
1097         return (error);
1098 }
1099
1100 /*
1101  * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
1102  * This routine must call iocom->rcvmsg() for anything not automatically
1103  * handled.
1104  */
1105 static int
1106 kdmsg_autorxmsg(kdmsg_msg_t *msg)
1107 {
1108         kdmsg_iocom_t *iocom = msg->state->iocom;
1109         kdmsg_msg_t *rep;
1110         int error = 0;
1111         uint32_t cmd;
1112
1113         /*
1114          * Main switch processes transaction create/delete sequences only.
1115          * Use icmd (DELETEs use DMSG_LNK_ERROR
1116          *
1117          * NOTE: If processing in-transaction messages you generally want
1118          *       an inner switch on msg->any.head.cmd.
1119          */
1120         if (msg->state) {
1121                 cmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1122                       (msg->any.head.cmd & (DMSGF_CREATE |
1123                                             DMSGF_DELETE |
1124                                             DMSGF_REPLY));
1125         } else {
1126                 cmd = 0;
1127         }
1128
1129         switch(cmd) {
1130         case DMSG_LNK_PING:
1131                 /*
1132                  * Received ping, send reply
1133                  */
1134                 rep = kdmsg_msg_alloc(msg->state, DMSG_LNK_PING | DMSGF_REPLY,
1135                                       NULL, NULL);
1136                 kdmsg_msg_write(rep);
1137                 break;
1138         case DMSG_LNK_PING | DMSGF_REPLY:
1139                 /* ignore replies */
1140                 break;
1141         case DMSG_LNK_CONN | DMSGF_CREATE:
1142         case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
1143                 /*
1144                  * Received LNK_CONN transaction.  Transmit response and
1145                  * leave transaction open, which allows the other end to
1146                  * start to the SPAN protocol.
1147                  *
1148                  * Handle shim after acknowledging the CONN.
1149                  */
1150                 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1151                         if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1152                                 kdmsg_msg_result(msg, 0);
1153                                 if (iocom->auto_callback)
1154                                         iocom->auto_callback(msg);
1155                         } else {
1156                                 error = iocom->rcvmsg(msg);
1157                         }
1158                         break;
1159                 }
1160                 /* fall through */
1161         case DMSG_LNK_CONN | DMSGF_DELETE:
1162                 /*
1163                  * This message is usually simulated after a link is lost
1164                  * to clean up the transaction.
1165                  */
1166                 if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1167                         if (iocom->auto_callback)
1168                                 iocom->auto_callback(msg);
1169                         kdmsg_msg_reply(msg, 0);
1170                 } else {
1171                         error = iocom->rcvmsg(msg);
1172                 }
1173                 break;
1174         case DMSG_LNK_SPAN | DMSGF_CREATE:
1175         case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1176                 /*
1177                  * Received LNK_SPAN transaction.  We do not have to respond
1178                  * (except on termination), but we must leave the transaction
1179                  * open.
1180                  *
1181                  * Handle shim after acknowledging the SPAN.
1182                  */
1183                 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1184                         if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1185                                 if (iocom->auto_callback)
1186                                         iocom->auto_callback(msg);
1187                                 break;
1188                         }
1189                         /* fall through */
1190                 } else {
1191                         error = iocom->rcvmsg(msg);
1192                         break;
1193                 }
1194                 /* fall through */
1195         case DMSG_LNK_SPAN | DMSGF_DELETE:
1196                 /*
1197                  * Process shims (auto_callback) before cleaning up the
1198                  * circuit structure and closing the transactions.  Device
1199                  * driver should ensure that the circuit is not used after
1200                  * the auto_callback() returns.
1201                  *
1202                  * Handle shim before closing the SPAN transaction.
1203                  */
1204                 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1205                         if (iocom->auto_callback)
1206                                 iocom->auto_callback(msg);
1207                         kdmsg_msg_reply(msg, 0);
1208                 } else {
1209                         error = iocom->rcvmsg(msg);
1210                 }
1211                 break;
1212         default:
1213                 /*
1214                  * Anything unhandled goes into rcvmsg.
1215                  *
1216                  * NOTE: Replies to link-level messages initiated by our side
1217                  *       are handled by the state callback, they are NOT
1218                  *       handled here.
1219                  */
1220                 error = iocom->rcvmsg(msg);
1221                 break;
1222         }
1223         return (error);
1224 }
1225
1226 /*
1227  * Post-receive-handling message and state cleanup.  This routine is called
1228  * after the state function handling/callback to properly dispose of the
1229  * message and unlink the state's parent/subq linkage if the state is
1230  * completely closed.
1231  *
1232  * msglk is not held.
1233  */
1234 static
1235 void
1236 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1237 {
1238         kdmsg_state_t *state = msg->state;
1239         kdmsg_iocom_t *iocom = state->iocom;
1240
1241         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1242         if (state != &iocom->state0) {
1243                 /*
1244                  * When terminating a transaction (in either direction), all
1245                  * sub-states are aborted.
1246                  */
1247                 if ((msg->any.head.cmd & DMSGF_DELETE) &&
1248                     TAILQ_FIRST(&msg->state->subq)) {
1249                         kdio_printf(iocom, 2,
1250                                     "simulate failure for substates of "
1251                                     "state %p cmd %08x/%08x\n",
1252                                     msg->state,
1253                                     msg->state->rxcmd,
1254                                     msg->state->txcmd);
1255                         kdmsg_simulate_failure(msg->state,
1256                                                0, DMSG_ERR_LOSTLINK);
1257                 }
1258
1259                 /*
1260                  * Once the state is fully closed we can (try to) remove it
1261                  * from the subq topology.
1262                  */
1263                 if ((state->flags & KDMSG_STATE_SUBINSERTED) &&
1264                     (state->rxcmd & DMSGF_DELETE) &&
1265                     (state->txcmd & DMSGF_DELETE)) {
1266                         /* 
1267                          * Remove parent linkage if state is completely closed.
1268                          */
1269                         kdmsg_subq_delete(state);
1270                 }
1271         }
1272         kdmsg_msg_free(msg);
1273
1274         lockmgr(&iocom->msglk, LK_RELEASE);
1275 }
1276
1277 /*
1278  * Remove state from its parent's subq.  This can wind up recursively
1279  * dropping the parent upward.
1280  *
1281  * NOTE: Once we drop the parent, our pstate pointer may become invalid.
1282  */
1283 static
1284 void
1285 kdmsg_subq_delete(kdmsg_state_t *state)
1286 {
1287         kdmsg_state_t *pstate;
1288
1289         if (state->flags & KDMSG_STATE_SUBINSERTED) {
1290                 pstate = state->parent;
1291                 KKASSERT(pstate);
1292                 if (pstate->scan == state)
1293                         pstate->scan = NULL;
1294                 TAILQ_REMOVE(&pstate->subq, state, entry);
1295                 state->flags &= ~KDMSG_STATE_SUBINSERTED;
1296                 state->parent = NULL;
1297                 if (TAILQ_EMPTY(&pstate->subq)) {
1298                         kdmsg_state_drop(pstate);/* pstate->subq */
1299                 }
1300                 pstate = NULL;                   /* safety */
1301                 kdmsg_state_drop(state);         /* pstate->subq */
1302         } else {
1303                 KKASSERT(state->parent == NULL);
1304         }
1305 }
1306
1307 /*
1308  * Simulate receiving a message which terminates an active transaction
1309  * state.  Our simulated received message must set DELETE and may also
1310  * have to set CREATE.  It must also ensure that all fields are set such
1311  * that the receive handling code can find the state (kdmsg_state_msgrx())
1312  * or an endless loop will ensue.
1313  *
1314  * This is used when the other end of the link is dead so the device driver
1315  * gets a completed transaction for all pending states.
1316  *
1317  * Called with iocom locked.
1318  */
1319 static
1320 void
1321 kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error)
1322 {
1323         kdmsg_state_t *substate;
1324
1325         kdmsg_state_hold(state);                /* aborting */
1326
1327         /*
1328          * Abort parent state first. Parent will not actually disappear
1329          * until children are gone.  Device drivers must handle the situation.
1330          * The advantage of this is that device drivers can flag the situation
1331          * as an interlock against new operations on dying states.  And since
1332          * device operations are often asynchronous anyway, this sequence of
1333          * events works out better.
1334          */
1335         if (meto)
1336                 kdmsg_state_abort(state);
1337
1338         /*
1339          * Recurse through any children.
1340          */
1341 again:
1342         TAILQ_FOREACH(substate, &state->subq, entry) {
1343                 if (substate->flags & KDMSG_STATE_ABORTING)
1344                         continue;
1345                 state->scan = substate;
1346                 kdmsg_simulate_failure(substate, 1, error);
1347                 if (state->scan != substate)
1348                         goto again;
1349         }
1350         kdmsg_state_drop(state);                /* aborting */
1351 }
1352
1353 static
1354 void
1355 kdmsg_state_abort(kdmsg_state_t *state)
1356 {
1357         kdmsg_msg_t *msg;
1358
1359         /*
1360          * Set ABORTING and DYING, return if already set.  If the state was
1361          * just allocated we defer the abort operation until the related
1362          * message is processed.
1363          */
1364         KKASSERT((state->flags & KDMSG_STATE_ABORTING) == 0);
1365         if (state->flags & KDMSG_STATE_ABORTING)
1366                 return;
1367         state->flags |= KDMSG_STATE_ABORTING;
1368         kdmsg_state_dying(state);
1369         if (state->flags & KDMSG_STATE_NEW) {
1370                 kdio_printf(iocom, 5,
1371                             "kdmsg_state_abort(0): state %p rxcmd %08x "
1372                             "txcmd %08x flags %08x - in NEW state\n",
1373                             state, state->rxcmd,
1374                             state->txcmd, state->flags);
1375                 return;
1376         }
1377
1378         /*
1379          * NOTE: The DELETE flag might already be set due to an early
1380          *       termination.
1381          *
1382          * NOTE: Args to kdmsg_msg_alloc() to avoid dynamic state allocation.
1383          *
1384          * NOTE: We are simulating a received message using our state
1385          *       (vs a message generated by the other side using its state),
1386          *       so we must invert DMSGF_REVTRANS and DMSGF_REVCIRC.
1387          */
1388         kdio_printf(iocom, 5, 
1389                     "kdmsg_state_abort(1): state %p rxcmd %08x txcmd %08x\n",
1390                     state, state->rxcmd, state->txcmd);
1391         if ((state->rxcmd & DMSGF_DELETE) == 0) {
1392                 msg = kdmsg_msg_alloc(state, DMSG_LNK_ERROR, NULL, NULL);
1393                 if ((state->rxcmd & DMSGF_CREATE) == 0)
1394                         msg->any.head.cmd |= DMSGF_CREATE;
1395                 msg->any.head.cmd |= DMSGF_DELETE |
1396                                      (state->rxcmd & DMSGF_REPLY);
1397                 msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
1398                 msg->any.head.error = DMSG_ERR_LOSTLINK;
1399                 kdio_printf(iocom, 5,
1400                             "kdmsg_state_abort(a): state %p msgcmd %08x\n",
1401                             state, msg->any.head.cmd);
1402                 /* circuit not initialized */
1403                 lockmgr(&state->iocom->msglk, LK_RELEASE);
1404                 kdmsg_msg_receive_handling(msg);
1405                 lockmgr(&state->iocom->msglk, LK_EXCLUSIVE);
1406                 msg = NULL;
1407         }
1408         kdio_printf(iocom, 5,
1409                     "kdmsg_state_abort(2): state %p rxcmd %08x txcmd %08x\n",
1410                     state, state->rxcmd, state->txcmd);
1411 }
1412
1413 /*
1414  * Recursively sets KDMSG_STATE_DYING on state and all sub-states, preventing
1415  * the transmission of any new messages on these states.  This is done
1416  * atomically when parent state is terminating, whereas setting ABORTING is
1417  * not atomic and can leak races.
1418  */
1419 static
1420 void
1421 kdmsg_state_dying(kdmsg_state_t *state)
1422 {
1423         kdmsg_state_t *scan;
1424
1425         if ((state->flags & KDMSG_STATE_DYING) == 0) {
1426                 state->flags |= KDMSG_STATE_DYING;
1427                 TAILQ_FOREACH(scan, &state->subq, entry)
1428                         kdmsg_state_dying(scan);
1429         }
1430 }
1431
1432 /*
1433  * Process state tracking for a message prior to transmission.
1434  *
1435  * Called with msglk held and the msg dequeued.  Returns non-zero if
1436  * the message is bad and should be deleted by the caller.
1437  *
1438  * One-off messages are usually with dummy state and msg->state may be NULL
1439  * in this situation.
1440  *
1441  * New transactions (when CREATE is set) will insert the state.
1442  *
1443  * May request that caller discard the message by setting *discardp to 1.
1444  * A NULL state may be returned in this case.
1445  */
1446 static
1447 int
1448 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1449 {
1450         kdmsg_iocom_t *iocom = msg->state->iocom;
1451         kdmsg_state_t *state;
1452         int error;
1453
1454         /*
1455          * Make sure a state structure is ready to go in case we need a new
1456          * one.  This is the only routine which uses freewr_state so no
1457          * races are possible.
1458          */
1459         if ((state = iocom->freewr_state) == NULL) {
1460                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1461                 state->flags = KDMSG_STATE_DYNAMIC;
1462                 state->iocom = iocom;
1463                 state->refs = 1;
1464                 TAILQ_INIT(&state->subq);
1465                 iocom->freewr_state = state;
1466         }
1467
1468         /*
1469          * Lock RB tree.  If persistent state is present it will have already
1470          * been assigned to msg.
1471          */
1472         state = msg->state;
1473
1474         /*
1475          * Short-cut one-off or mid-stream messages (state may be NULL).
1476          */
1477         if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1478                                   DMSGF_ABORT)) == 0) {
1479                 return(0);
1480         }
1481
1482
1483         /*
1484          * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1485          * inside the case statements.
1486          */
1487         switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1488                                     DMSGF_REPLY)) {
1489         case DMSGF_CREATE:
1490         case DMSGF_CREATE | DMSGF_DELETE:
1491                 /*
1492                  * Insert the new persistent message state and mark
1493                  * half-closed if DELETE is set.  Since this is a new
1494                  * message it isn't possible to transition into the fully
1495                  * closed state here.
1496                  *
1497                  * XXX state must be assigned and inserted by
1498                  *     kdmsg_msg_write().  txcmd is assigned by us
1499                  *     on-transmit.
1500                  */
1501                 KKASSERT(state != NULL);
1502                 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1503                 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1504                 state->rxcmd = DMSGF_REPLY;
1505                 state->flags &= ~KDMSG_STATE_NEW;
1506                 error = 0;
1507                 break;
1508         case DMSGF_DELETE:
1509                 /*
1510                  * Sent ABORT+DELETE in case where msgid has already
1511                  * been fully closed, ignore the message.
1512                  */
1513                 if (state == &iocom->state0) {
1514                         if (msg->any.head.cmd & DMSGF_ABORT) {
1515                                 error = EALREADY;
1516                         } else {
1517                                 kdio_printf(iocom, 1,
1518                                         "msgtx: no state match "
1519                                         "for DELETE cmd=%08x msgid=%016jx\n",
1520                                         msg->any.head.cmd,
1521                                         (intmax_t)msg->any.head.msgid);
1522                                 error = EINVAL;
1523                         }
1524                         break;
1525                 }
1526
1527                 /*
1528                  * Sent ABORT+DELETE in case where msgid has
1529                  * already been reused for an unrelated message,
1530                  * ignore the message.
1531                  */
1532                 if ((state->txcmd & DMSGF_CREATE) == 0) {
1533                         if (msg->any.head.cmd & DMSGF_ABORT) {
1534                                 error = EALREADY;
1535                         } else {
1536                                 kdio_printf(iocom, 1, "%s\n",
1537                                             "msgtx: state reused "
1538                                             "for DELETE");
1539                                 error = EINVAL;
1540                         }
1541                         break;
1542                 }
1543                 error = 0;
1544                 break;
1545         default:
1546                 /*
1547                  * Check for mid-stream ABORT command sent
1548                  */
1549                 if (msg->any.head.cmd & DMSGF_ABORT) {
1550                         if (state == &state->iocom->state0 ||
1551                             (state->txcmd & DMSGF_CREATE) == 0) {
1552                                 error = EALREADY;
1553                                 break;
1554                         }
1555                 }
1556                 error = 0;
1557                 break;
1558         case DMSGF_REPLY | DMSGF_CREATE:
1559         case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1560                 /*
1561                  * When transmitting a reply with CREATE set the original
1562                  * persistent state message should already exist.
1563                  */
1564                 if (state == &state->iocom->state0) {
1565                         kdio_printf(iocom, 1, "%s\n",
1566                                     "msgtx: no state match "
1567                                     "for REPLY | CREATE");
1568                         error = EINVAL;
1569                         break;
1570                 }
1571                 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1572                 error = 0;
1573                 break;
1574         case DMSGF_REPLY | DMSGF_DELETE:
1575                 /*
1576                  * When transmitting a reply with DELETE set the original
1577                  * persistent state message should already exist.
1578                  *
1579                  * This is very similar to the REPLY|CREATE|* case except
1580                  * txcmd is already stored, so we just add the DELETE flag.
1581                  *
1582                  * Sent REPLY+ABORT+DELETE in case where msgid has
1583                  * already been fully closed, ignore the message.
1584                  */
1585                 if (state == &state->iocom->state0) {
1586                         if (msg->any.head.cmd & DMSGF_ABORT) {
1587                                 error = EALREADY;
1588                         } else {
1589                                 kdio_printf(iocom, 1, "%s\n",
1590                                             "msgtx: no state match "
1591                                             "for REPLY | DELETE");
1592                                 error = EINVAL;
1593                         }
1594                         break;
1595                 }
1596
1597                 /*
1598                  * Sent REPLY+ABORT+DELETE in case where msgid has already
1599                  * been reused for an unrelated message, ignore the message.
1600                  */
1601                 if ((state->txcmd & DMSGF_CREATE) == 0) {
1602                         if (msg->any.head.cmd & DMSGF_ABORT) {
1603                                 error = EALREADY;
1604                         } else {
1605                                 kdio_printf(iocom, 1, "%s\n",
1606                                             "msgtx: state reused "
1607                                             "for REPLY | DELETE");
1608                                 error = EINVAL;
1609                         }
1610                         break;
1611                 }
1612                 error = 0;
1613                 break;
1614         case DMSGF_REPLY:
1615                 /*
1616                  * Check for mid-stream ABORT reply sent.
1617                  *
1618                  * One-off REPLY messages are allowed for e.g. status updates.
1619                  */
1620                 if (msg->any.head.cmd & DMSGF_ABORT) {
1621                         if (state == &state->iocom->state0 ||
1622                             (state->txcmd & DMSGF_CREATE) == 0) {
1623                                 error = EALREADY;
1624                                 break;
1625                         }
1626                 }
1627                 error = 0;
1628                 break;
1629         }
1630
1631         /*
1632          * Set interlock (XXX hack) in case the send side blocks and a
1633          * response is returned before kdmsg_state_cleanuptx() can be
1634          * run.
1635          */
1636         if (state && error == 0)
1637                 state->flags |= KDMSG_STATE_INTERLOCK;
1638
1639         return (error);
1640 }
1641
1642 /*
1643  * Called with iocom locked.
1644  */
1645 static
1646 void
1647 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1648 {
1649         kdmsg_iocom_t *iocom = msg->state->iocom;
1650         kdmsg_state_t *state;
1651
1652         if ((state = msg->state) == NULL) {
1653                 kdmsg_msg_free(msg);
1654                 return;
1655         }
1656
1657         /*
1658          * Clear interlock (XXX hack) in case the send side blocks and a
1659          * response is returned in the other thread before
1660          * kdmsg_state_cleanuptx() can be run.  We maintain our hold on
1661          * iocom->msglk so we can do this before completing our task.
1662          */
1663         if (state->flags & KDMSG_STATE_SIGNAL) {
1664                 kdio_printf(iocom, 1, "state %p interlock!\n", state);
1665                 wakeup(state);
1666         }
1667         state->flags &= ~(KDMSG_STATE_INTERLOCK | KDMSG_STATE_SIGNAL);
1668         kdmsg_state_hold(state);
1669
1670         if (msg->any.head.cmd & DMSGF_DELETE) {
1671                 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1672                 state->txcmd |= DMSGF_DELETE;
1673                 if (state->rxcmd & DMSGF_DELETE) {
1674                         KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1675                         if (state->txcmd & DMSGF_REPLY) {
1676                                 KKASSERT(msg->any.head.cmd &
1677                                          DMSGF_REPLY);
1678                                 RB_REMOVE(kdmsg_state_tree,
1679                                           &iocom->staterd_tree, state);
1680                         } else {
1681                                 KKASSERT((msg->any.head.cmd &
1682                                           DMSGF_REPLY) == 0);
1683                                 RB_REMOVE(kdmsg_state_tree,
1684                                           &iocom->statewr_tree, state);
1685                         }
1686                         state->flags &= ~KDMSG_STATE_RBINSERTED;
1687
1688                         /*
1689                          * The subq recursion is used for parent linking and
1690                          * scanning the topology for aborts, we can only
1691                          * remove leafs.  The circuit is effectively dead now,
1692                          * but topology won't be torn down until all of its
1693                          * children have finished/aborted.
1694                          *
1695                          * This is particularly important for end-point
1696                          * devices which might need to access private data
1697                          * in parent states.  Out of order disconnects can
1698                          * occur if an end-point device is processing a
1699                          * message transaction asynchronously because abort
1700                          * requests are basically synchronous and it probably
1701                          * isn't convenient (or possible) for the end-point
1702                          * to abort an asynchronous operation.
1703                          */
1704                         if (TAILQ_EMPTY(&state->subq))
1705                                 kdmsg_subq_delete(state);
1706                         kdmsg_msg_free(msg);
1707                         kdmsg_state_drop(state);   /* state on rbtree */
1708                 } else {
1709                         kdmsg_msg_free(msg);
1710                 }
1711         } else {
1712                 kdmsg_msg_free(msg);
1713         }
1714
1715         /*
1716          * Deferred abort after transmission.
1717          */
1718         if ((state->flags & (KDMSG_STATE_ABORTING | KDMSG_STATE_DYING)) &&
1719             (state->rxcmd & DMSGF_DELETE) == 0) {
1720                 kdio_printf(iocom, 5,
1721                             "kdmsg_state_cleanuptx: state=%p "
1722                             "executing deferred abort\n",
1723                             state);
1724                 state->flags &= ~KDMSG_STATE_ABORTING;
1725                 kdmsg_state_abort(state);
1726         }
1727         kdmsg_state_drop(state);
1728 }
1729
1730 static
1731 void
1732 _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1733 {
1734         atomic_add_int(&state->refs, 1);
1735 #if KDMSG_DEBUG
1736         kd_printf(4, "state %p +%d\t%s:%d\n", state, state->refs, file, line);
1737 #endif
1738 }
1739
1740 static
1741 void
1742 _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1743 {
1744         KKASSERT(state->refs > 0);
1745 #if KDMSG_DEBUG
1746         kd_printf(4, "state %p -%d\t%s:%d\n", state, state->refs, file, line);
1747 #endif
1748         if (atomic_fetchadd_int(&state->refs, -1) == 1)
1749                 kdmsg_state_free(state);
1750 }
1751
1752 static
1753 void
1754 kdmsg_state_free(kdmsg_state_t *state)
1755 {
1756         kdmsg_iocom_t *iocom = state->iocom;
1757
1758         KKASSERT((state->flags & KDMSG_STATE_RBINSERTED) == 0);
1759         KKASSERT((state->flags & KDMSG_STATE_SUBINSERTED) == 0);
1760         KKASSERT(TAILQ_EMPTY(&state->subq));
1761
1762         if (state != &state->iocom->state0)
1763                 kfree(state, iocom->mmsg);
1764 }
1765
1766 kdmsg_msg_t *
1767 kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
1768                 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1769 {
1770         kdmsg_iocom_t *iocom = state->iocom;
1771         kdmsg_state_t *pstate;
1772         kdmsg_msg_t *msg;
1773         size_t hbytes;
1774
1775         KKASSERT(iocom != NULL);
1776         hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1777         msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1778                       iocom->mmsg, M_WAITOK | M_ZERO);
1779         msg->hdr_size = hbytes;
1780
1781         if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
1782                 /*
1783                  * New transaction, requires tracking state and a unique
1784                  * msgid to be allocated.
1785                  *
1786                  * It is possible to race a circuit failure, inherit the
1787                  * parent's STATE_DYING flag to trigger an abort sequence
1788                  * in the transmit path.  By not inheriting ABORTING the
1789                  * abort sequence can recurse.
1790                  *
1791                  * NOTE: The transactions has not yet been initiated so we
1792                  *       cannot set DMSGF_CREATE/DELETE bits in txcmd or rxcmd.
1793                  *       We have to properly setup DMSGF_REPLY, however.
1794                  */
1795                 pstate = state;
1796                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1797                 TAILQ_INIT(&state->subq);
1798                 state->iocom = iocom;
1799                 state->parent = pstate;
1800                 state->flags = KDMSG_STATE_DYNAMIC |
1801                                KDMSG_STATE_NEW;
1802                 state->func = func;
1803                 state->any.any = data;
1804                 state->msgid = (uint64_t)(uintptr_t)state;
1805                 /*msg->any.head.msgid = state->msgid;XXX*/
1806
1807                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1808                 if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1809                         panic("duplicate msgid allocated");
1810                 if (TAILQ_EMPTY(&pstate->subq))
1811                         kdmsg_state_hold(pstate);/* pstate->subq */
1812                 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
1813                 state->flags |= KDMSG_STATE_RBINSERTED |
1814                                 KDMSG_STATE_SUBINSERTED;
1815                 state->flags |= pstate->flags & KDMSG_STATE_DYING;
1816                 kdmsg_state_hold(state);        /* pstate->subq */
1817                 kdmsg_state_hold(state);        /* state on rbtree */
1818                 kdmsg_state_hold(state);        /* msg->state */
1819                 lockmgr(&iocom->msglk, LK_RELEASE);
1820         } else {
1821                 pstate = state->parent;
1822                 KKASSERT(pstate != NULL);
1823                 kdmsg_state_hold(state);        /* msg->state */
1824         }
1825
1826         if (state->flags & KDMSG_STATE_OPPOSITE)
1827                 cmd |= DMSGF_REVTRANS;
1828         if (pstate->flags & KDMSG_STATE_OPPOSITE)
1829                 cmd |= DMSGF_REVCIRC;
1830
1831         msg->any.head.magic = DMSG_HDR_MAGIC;
1832         msg->any.head.cmd = cmd;
1833         msg->any.head.msgid = state->msgid;
1834         msg->any.head.circuit = pstate->msgid;
1835         msg->state = state;
1836
1837         return (msg);
1838 }
1839
1840 void
1841 kdmsg_msg_free(kdmsg_msg_t *msg)
1842 {
1843         kdmsg_iocom_t *iocom = msg->state->iocom;
1844         kdmsg_state_t *state;
1845
1846         if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1847             msg->aux_data && msg->aux_size) {
1848                 kfree(msg->aux_data, iocom->mmsg);
1849                 msg->aux_data = NULL;
1850                 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1851         }
1852         if ((state = msg->state) != NULL) {
1853                 msg->state = NULL;
1854                 kdmsg_state_drop(state);        /* msg->state */
1855         }
1856         msg->aux_data = NULL;
1857         msg->aux_size = 0;
1858
1859         kfree(msg, iocom->mmsg);
1860 }
1861
1862 void
1863 kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data)
1864 {
1865         if (msg->flags & KDMSG_FLAG_AUXALLOC) {
1866                 data->aux_data = msg->aux_data;
1867                 data->aux_size = msg->aux_size;
1868                 data->iocom = msg->state->iocom;
1869                 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1870         } else {
1871                 data->aux_data = NULL;
1872                 data->aux_size = 0;
1873                 data->iocom = msg->state->iocom;
1874         }
1875 }
1876
1877 void
1878 kdmsg_free_aux_data(kdmsg_data_t *data)
1879 {
1880         if (data->aux_data) {
1881                 kfree(data->aux_data, data->iocom->mmsg);
1882                 data->aux_data = NULL;
1883         }
1884 }
1885
1886 /*
1887  * Indexed messages are stored in a red-black tree indexed by their
1888  * msgid.  Only persistent messages are indexed.
1889  */
1890 int
1891 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1892 {
1893         if (state1->iocom < state2->iocom)
1894                 return(-1);
1895         if (state1->iocom > state2->iocom)
1896                 return(1);
1897         if (state1->msgid < state2->msgid)
1898                 return(-1);
1899         if (state1->msgid > state2->msgid)
1900                 return(1);
1901         return(0);
1902 }
1903
1904 /*
1905  * Write a message.  All requisit command flags have been set.
1906  *
1907  * If msg->state is non-NULL the message is written to the existing
1908  * transaction.  msgid will be set accordingly.
1909  *
1910  * If msg->state is NULL and CREATE is set new state is allocated and
1911  * (func, data) is installed.  A msgid is assigned.
1912  *
1913  * If msg->state is NULL and CREATE is not set the message is assumed
1914  * to be a one-way message.  The originator must assign the msgid
1915  * (or leave it 0, which is typical.
1916  *
1917  * This function merely queues the message to the management thread, it
1918  * does not write to the message socket/pipe.
1919  */
1920 void
1921 kdmsg_msg_write(kdmsg_msg_t *msg)
1922 {
1923         kdmsg_iocom_t *iocom = msg->state->iocom;
1924
1925         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1926         kdmsg_msg_write_locked(iocom, msg);
1927         lockmgr(&iocom->msglk, LK_RELEASE);
1928 }
1929
1930 static void
1931 kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg)
1932 {
1933         kdmsg_state_t *state;
1934
1935         if (msg->state) {
1936                 /*
1937                  * Continuance or termination of existing transaction.
1938                  * The transaction could have been initiated by either end.
1939                  *
1940                  * (Function callback and aux data for the receive side can
1941                  * be replaced or left alone).
1942                  */
1943                 state = msg->state;
1944                 msg->any.head.msgid = state->msgid;
1945         } else {
1946                 /*
1947                  * One-off message (always uses msgid 0 to distinguish
1948                  * between a possibly lost in-transaction message due to
1949                  * competing aborts and a real one-off message?)
1950                  */
1951                 state = NULL;
1952                 msg->any.head.msgid = 0;
1953         }
1954
1955         /*
1956          * For stateful messages, if the circuit is dead or dying we have
1957          * to abort the potentially newly-created state and discard the
1958          * message.
1959          *
1960          * - We must discard the message because the other end will not
1961          *   be expecting any more messages over the dead or dying circuit
1962          *   and might not be able to receive them.
1963          *
1964          * - We abort the state by simulating a failure to generate a fake
1965          *   incoming DELETE.  This will trigger the state callback and allow
1966          *   the device to clean things up and reply, closing the outgoing
1967          *   direction and allowing the state to be freed.
1968          *
1969          * This situation occurs quite often, particularly as SPANs stabilize.
1970          * End-points must do the right thing.
1971          */
1972         if (state) {
1973                 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1974                 if (state->flags & KDMSG_STATE_DYING) {
1975 #if 0
1976                 if ((state->flags & KDMSG_STATE_DYING) ||
1977                     (state->parent->txcmd & DMSGF_DELETE) ||
1978                     (state->parent->flags & KDMSG_STATE_DYING)) {
1979 #endif
1980                         kdio_printf(iocom, 4,
1981                                     "kdmsg_msg_write: Write to dying circuit "
1982                                     "state=%p "
1983                                     "ptxcmd=%08x prxcmd=%08x flags=%08x\n",
1984                                     state,
1985                                     state->parent->rxcmd,
1986                                     state->parent->txcmd,
1987                                     state->parent->flags);
1988                         kdmsg_state_hold(state);
1989                         kdmsg_state_msgtx(msg);
1990                         kdmsg_state_cleanuptx(msg);
1991                         kdmsg_state_drop(state);
1992                         return;
1993                 }
1994         }
1995
1996         /*
1997          * Finish up the msg fields.  Note that msg->aux_size and the
1998          * aux_bytes stored in the message header represent the unaligned
1999          * (actual) bytes of data, but the buffer is sized to an aligned
2000          * size and the CRC is generated over the aligned length.
2001          */
2002         msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
2003         ++iocom->msg_seq;
2004
2005         if (msg->aux_data && msg->aux_size) {
2006                 uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
2007
2008                 msg->any.head.aux_bytes = msg->aux_size;
2009                 msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
2010         }
2011         msg->any.head.hdr_crc = 0;
2012         msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
2013
2014         /*
2015          * If termination races new message senders we must drain the
2016          * message immediately instead of queue it.
2017          */
2018         if (iocom->flags & KDMSG_IOCOMF_EXITNOACC)
2019                 kdmsg_drain_msg(msg);
2020         else
2021                 TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
2022
2023         if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
2024                 atomic_clear_int(&iocom->msg_ctl,
2025                                  KDMSG_CLUSTERCTL_SLEEPING);
2026                 wakeup(&iocom->msg_ctl);
2027         }
2028 }
2029
2030 /*
2031  * Reply to a message and terminate our side of the transaction.
2032  *
2033  * If msg->state is non-NULL we are replying to a one-way message.
2034  */
2035 void
2036 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
2037 {
2038         kdmsg_state_t *state = msg->state;
2039         kdmsg_msg_t *nmsg;
2040         uint32_t cmd;
2041
2042         /*
2043          * Reply with a simple error code and terminate the transaction.
2044          */
2045         cmd = DMSG_LNK_ERROR;
2046
2047         /*
2048          * Check if our direction has even been initiated yet, set CREATE.
2049          *
2050          * Check what direction this is (command or reply direction).  Note
2051          * that txcmd might not have been initiated yet.
2052          *
2053          * If our direction has already been closed we just return without
2054          * doing anything.
2055          */
2056         if (state != &state->iocom->state0) {
2057                 if (state->txcmd & DMSGF_DELETE)
2058                         return;
2059                 if ((state->txcmd & DMSGF_CREATE) == 0)
2060                         cmd |= DMSGF_CREATE;
2061                 if (state->txcmd & DMSGF_REPLY)
2062                         cmd |= DMSGF_REPLY;
2063                 cmd |= DMSGF_DELETE;
2064         } else {
2065                 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2066                         cmd |= DMSGF_REPLY;
2067         }
2068
2069         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2070         nmsg->any.head.error = error;
2071         kdmsg_msg_write(nmsg);
2072 }
2073
2074 /*
2075  * Reply to a message and continue our side of the transaction.
2076  *
2077  * If msg->state is non-NULL we are replying to a one-way message and this
2078  * function degenerates into the same as kdmsg_msg_reply().
2079  */
2080 void
2081 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
2082 {
2083         kdmsg_state_t *state = msg->state;
2084         kdmsg_msg_t *nmsg;
2085         uint32_t cmd;
2086
2087         /*
2088          * Return a simple result code, do NOT terminate the transaction.
2089          */
2090         cmd = DMSG_LNK_ERROR;
2091
2092         /*
2093          * Check if our direction has even been initiated yet, set CREATE.
2094          *
2095          * Check what direction this is (command or reply direction).  Note
2096          * that txcmd might not have been initiated yet.
2097          *
2098          * If our direction has already been closed we just return without
2099          * doing anything.
2100          */
2101         if (state != &state->iocom->state0) {
2102                 if (state->txcmd & DMSGF_DELETE)
2103                         return;
2104                 if ((state->txcmd & DMSGF_CREATE) == 0)
2105                         cmd |= DMSGF_CREATE;
2106                 if (state->txcmd & DMSGF_REPLY)
2107                         cmd |= DMSGF_REPLY;
2108                 /* continuing transaction, do not set MSGF_DELETE */
2109         } else {
2110                 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2111                         cmd |= DMSGF_REPLY;
2112         }
2113
2114         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2115         nmsg->any.head.error = error;
2116         kdmsg_msg_write(nmsg);
2117 }
2118
2119 /*
2120  * Reply to a message and terminate our side of the transaction.
2121  *
2122  * If msg->state is non-NULL we are replying to a one-way message.
2123  */
2124 void
2125 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
2126 {
2127         kdmsg_msg_t *nmsg;
2128         uint32_t cmd;
2129
2130         /*
2131          * Reply with a simple error code and terminate the transaction.
2132          */
2133         cmd = DMSG_LNK_ERROR;
2134
2135         /*
2136          * Check if our direction has even been initiated yet, set CREATE.
2137          *
2138          * Check what direction this is (command or reply direction).  Note
2139          * that txcmd might not have been initiated yet.
2140          *
2141          * If our direction has already been closed we just return without
2142          * doing anything.
2143          */
2144         KKASSERT(state);
2145         if (state->txcmd & DMSGF_DELETE)
2146                 return;
2147         if ((state->txcmd & DMSGF_CREATE) == 0)
2148                 cmd |= DMSGF_CREATE;
2149         if (state->txcmd & DMSGF_REPLY)
2150                 cmd |= DMSGF_REPLY;
2151         cmd |= DMSGF_DELETE;
2152
2153         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2154         nmsg->any.head.error = error;
2155         kdmsg_msg_write(nmsg);
2156 }
2157
2158 /*
2159  * Reply to a message and continue our side of the transaction.
2160  *
2161  * If msg->state is non-NULL we are replying to a one-way message and this
2162  * function degenerates into the same as kdmsg_msg_reply().
2163  */
2164 void
2165 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
2166 {
2167         kdmsg_msg_t *nmsg;
2168         uint32_t cmd;
2169
2170         /*
2171          * Return a simple result code, do NOT terminate the transaction.
2172          */
2173         cmd = DMSG_LNK_ERROR;
2174
2175         /*
2176          * Check if our direction has even been initiated yet, set CREATE.
2177          *
2178          * Check what direction this is (command or reply direction).  Note
2179          * that txcmd might not have been initiated yet.
2180          *
2181          * If our direction has already been closed we just return without
2182          * doing anything.
2183          */
2184         KKASSERT(state);
2185         if (state->txcmd & DMSGF_DELETE)
2186                 return;
2187         if ((state->txcmd & DMSGF_CREATE) == 0)
2188                 cmd |= DMSGF_CREATE;
2189         if (state->txcmd & DMSGF_REPLY)
2190                 cmd |= DMSGF_REPLY;
2191         /* continuing transaction, do not set MSGF_DELETE */
2192
2193         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2194         nmsg->any.head.error = error;
2195         kdmsg_msg_write(nmsg);
2196 }