kernel - Remove some kdmsg debugging
[dragonfly.git] / sys / kern / kern_dmsg.c
1 /*-
2  * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * TODO: txcmd CREATE state is deferred by tx msgq, need to calculate
36  *       a streaming response.  See subr_diskiocom()'s diskiodone().
37  */
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/sysctl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/priv.h>
53 #include <sys/thread.h>
54 #include <sys/globaldata.h>
55 #include <sys/limits.h>
56
57 #include <sys/dmsg.h>
58
59 RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
60
61 SYSCTL_NODE(, OID_AUTO, kdmsg, CTLFLAG_RW, 0, "kdmsg");
62 static int kdmsg_debug = 1;
63 SYSCTL_INT(_kdmsg, OID_AUTO, debug, CTLFLAG_RW, &kdmsg_debug, 0,
64            "Set debug level for kernel dmsg layer");
65
/*
 * Debug output helpers.  The message is handed to kprintf() with a
 * "kdmsg: " prefix when kdmsg_debug is at or above (level), otherwise
 * nothing is evaluated or printed.
 *
 * Each macro is wrapped in do { } while (0) so that it expands to exactly
 * one statement.  A bare "if (...) kprintf(...)" expansion would capture
 * the "else" of an enclosing unbraced if/else at the call site.
 *
 * (ctl) must be a string literal and at least one argument must follow it.
 * kdio_printf() accepts the iocom for context but does not currently use
 * it.
 */
#define kd_printf(level, ctl, ...)					\
	do {								\
		if (kdmsg_debug >= (level))				\
			kprintf("kdmsg: " ctl, __VA_ARGS__);		\
	} while (0)

#define kdio_printf(iocom, level, ctl, ...)				\
	do {								\
		if (kdmsg_debug >= (level))				\
			kprintf("kdmsg: " ctl, __VA_ARGS__);		\
	} while (0)
71
72 static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
73 static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
74 static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
75 static void kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg);
76 static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
77 static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
78 static void kdmsg_subq_delete(kdmsg_state_t *state);
79 static void kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error);
80 static void kdmsg_state_abort(kdmsg_state_t *state);
81 static void kdmsg_state_dying(kdmsg_state_t *state);
82 static void kdmsg_state_free(kdmsg_state_t *state);
83
84 #ifdef KDMSG_DEBUG
85 #define KDMSG_DEBUG_ARGS        , const char *file, int line
86 #define kdmsg_state_hold(state) _kdmsg_state_hold(state, __FILE__, __LINE__)
87 #define kdmsg_state_drop(state) _kdmsg_state_drop(state, __FILE__, __LINE__)
88 #else
89 #define KDMSG_DEBUG 0
90 #define KDMSG_DEBUG_ARGS
91 #define kdmsg_state_hold(state) _kdmsg_state_hold(state)
92 #define kdmsg_state_drop(state) _kdmsg_state_drop(state)
93 #endif
94 static void _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
95 static void _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
96
97 static void kdmsg_iocom_thread_rd(void *arg);
98 static void kdmsg_iocom_thread_wr(void *arg);
99 static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
100
101 /*static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);*/
102
103 /*
104  * Initialize the roll-up communications structure for a network
105  * messaging session.  This function does not install the socket.
106  */
107 void
108 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
109                  struct malloc_type *mmsg,
110                  int (*rcvmsg)(kdmsg_msg_t *msg))
111 {
112         bzero(iocom, sizeof(*iocom));
113         iocom->handle = handle;
114         iocom->mmsg = mmsg;
115         iocom->rcvmsg = rcvmsg;
116         iocom->flags = flags;
117         lockinit(&iocom->msglk, "h2msg", 0, 0);
118         TAILQ_INIT(&iocom->msgq);
119         RB_INIT(&iocom->staterd_tree);
120         RB_INIT(&iocom->statewr_tree);
121
122         iocom->state0.iocom = iocom;
123         iocom->state0.parent = &iocom->state0;
124         TAILQ_INIT(&iocom->state0.subq);
125 }
126
127 /*
128  * [Re]connect using the passed file pointer.  The caller must ref the
129  * fp for us.  We own that ref now.
130  */
131 void
132 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
133                       const char *subsysname)
134 {
135         /*
136          * Destroy the current connection
137          */
138         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
139         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
140         while (iocom->msgrd_td || iocom->msgwr_td) {
141                 wakeup(&iocom->msg_ctl);
142                 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
143         }
144
145         /*
146          * Drop communications descriptor
147          */
148         if (iocom->msg_fp) {
149                 fdrop(iocom->msg_fp);
150                 iocom->msg_fp = NULL;
151         }
152
153         /*
154          * Setup new communications descriptor
155          */
156         iocom->msg_ctl = 0;
157         iocom->msg_fp = fp;
158         iocom->msg_seq = 0;
159         iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
160
161         lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
162                     NULL, 0, -1, "%s-msgrd", subsysname);
163         lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
164                     NULL, 0, -1, "%s-msgwr", subsysname);
165         lockmgr(&iocom->msglk, LK_RELEASE);
166 }
167
168 /*
169  * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
170  * this function to handle the state machine for LNK_CONN and LNK_SPAN.
171  */
172 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
173 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
174
175 void
176 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
177                          void (*auto_callback)(kdmsg_msg_t *msg))
178 {
179         kdmsg_msg_t *msg;
180
181         iocom->auto_callback = auto_callback;
182
183         msg = kdmsg_msg_alloc(&iocom->state0,
184                               DMSG_LNK_CONN | DMSGF_CREATE,
185                               kdmsg_lnk_conn_reply, NULL);
186         iocom->auto_lnk_conn.head = msg->any.head;
187         msg->any.lnk_conn = iocom->auto_lnk_conn;
188         iocom->conn_state = msg->state;
189         kdmsg_state_hold(msg->state);   /* iocom->conn_state */
190         kdmsg_msg_write(msg);
191 }
192
193 static
194 int
195 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
196 {
197         kdmsg_iocom_t *iocom = state->iocom;
198         kdmsg_msg_t *rmsg;
199
200         /*
201          * Upon receipt of the LNK_CONN acknowledgement initiate an
202          * automatic SPAN if we were asked to.  Used by e.g. xdisk, but
203          * not used by HAMMER2 which must manage more than one transmitted
204          * SPAN.
205          */
206         if ((msg->any.head.cmd & DMSGF_CREATE) &&
207             (iocom->flags & KDMSG_IOCOMF_AUTOTXSPAN)) {
208                 rmsg = kdmsg_msg_alloc(&iocom->state0,
209                                        DMSG_LNK_SPAN | DMSGF_CREATE,
210                                        kdmsg_lnk_span_reply, NULL);
211                 iocom->auto_lnk_span.head = rmsg->any.head;
212                 rmsg->any.lnk_span = iocom->auto_lnk_span;
213                 kdmsg_msg_write(rmsg);
214         }
215
216         /*
217          * Process shim after the CONN is acknowledged and before the CONN
218          * transaction is deleted.  For deletions this gives device drivers
219          * the ability to interlock new operations on the circuit before
220          * it becomes illegal and panics.
221          */
222         if (iocom->auto_callback)
223                 iocom->auto_callback(msg);
224
225         if ((state->txcmd & DMSGF_DELETE) == 0 &&
226             (msg->any.head.cmd & DMSGF_DELETE)) {
227                 /*
228                  * iocom->conn_state has a state ref, drop it when clearing.
229                  */
230                 if (iocom->conn_state)
231                         kdmsg_state_drop(iocom->conn_state);
232                 iocom->conn_state = NULL;
233                 kdmsg_msg_reply(msg, 0);
234         }
235
236         return (0);
237 }
238
239 static
240 int
241 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
242 {
243         /*
244          * Be sure to process shim before terminating the SPAN
245          * transaction.  Gives device drivers the ability to
246          * interlock new operations on the circuit before it
247          * becomes illegal and panics.
248          */
249         if (state->iocom->auto_callback)
250                 state->iocom->auto_callback(msg);
251
252         if ((state->txcmd & DMSGF_DELETE) == 0 &&
253             (msg->any.head.cmd & DMSGF_DELETE)) {
254                 kdmsg_msg_reply(msg, 0);
255         }
256         return (0);
257 }
258
259 /*
260  * Disconnect and clean up
261  */
262 void
263 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
264 {
265         kdmsg_state_t *state;
266         kdmsg_msg_t *msg;
267         int retries;
268
269         /*
270          * Ask the cluster controller to go away by setting
271          * KILLRX.  Send a PING to get a response to unstick reading
272          * from the pipe.
273          *
274          * After 10 seconds shitcan the pipe and do an unclean shutdown.
275          */
276         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
277
278         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
279         msg = kdmsg_msg_alloc(&iocom->state0, DMSG_LNK_PING, NULL, NULL);
280         kdmsg_msg_write_locked(iocom, msg);
281
282         retries = 10;
283         while (iocom->msgrd_td || iocom->msgwr_td) {
284                 wakeup(&iocom->msg_ctl);
285                 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
286                 if (--retries == 0 && iocom->msg_fp) {
287                         kdio_printf(iocom, 0, "%s\n",
288                                     "iocom_uninit: "
289                                     "shitcanning unresponsive pipe");
290                         fp_shutdown(iocom->msg_fp, SHUT_RDWR);
291                         /* retries allowed to go negative, keep looping */
292                 }
293         }
294
295         /*
296          * Cleanup caches
297          */
298         if ((state = iocom->freerd_state) != NULL) {
299                 iocom->freerd_state = NULL;
300                 kdmsg_state_drop(state);
301         }
302
303         if ((state = iocom->freewr_state) != NULL) {
304                 iocom->freewr_state = NULL;
305                 kdmsg_state_drop(state);
306         }
307
308         /*
309          * Drop communications descriptor
310          */
311         if (iocom->msg_fp) {
312                 fdrop(iocom->msg_fp);
313                 iocom->msg_fp = NULL;
314         }
315         lockmgr(&iocom->msglk, LK_RELEASE);
316 }
317
318 /*
319  * Cluster controller thread.  Perform messaging functions.  We have one
320  * thread for the reader and one for the writer.  The writer handles
321  * shutdown requests (which should break the reader thread).
322  */
323 static
324 void
325 kdmsg_iocom_thread_rd(void *arg)
326 {
327         kdmsg_iocom_t *iocom = arg;
328         dmsg_hdr_t hdr;
329         kdmsg_msg_t *msg = NULL;
330         size_t hbytes;
331         size_t abytes;
332         int error = 0;
333
334         while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLRX) == 0) {
335                 /*
336                  * Retrieve the message from the pipe or socket.
337                  */
338                 error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
339                                 NULL, 1, UIO_SYSSPACE);
340                 if (error)
341                         break;
342                 if (hdr.magic != DMSG_HDR_MAGIC) {
343                         kdio_printf(iocom, 1, "bad magic: %04x\n", hdr.magic);
344                         error = EINVAL;
345                         break;
346                 }
347                 hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
348                 if (hbytes < sizeof(hdr) || hbytes > DMSG_HDR_MAX) {
349                         kdio_printf(iocom, 1, "bad header size %zd\n", hbytes);
350                         error = EINVAL;
351                         break;
352                 }
353
354                 /* XXX messy: mask cmd to avoid allocating state */
355                 msg = kdmsg_msg_alloc(&iocom->state0,
356                                       hdr.cmd & DMSGF_BASECMDMASK,
357                                       NULL, NULL);
358                 msg->any.head = hdr;
359                 msg->hdr_size = hbytes;
360                 if (hbytes > sizeof(hdr)) {
361                         error = fp_read(iocom->msg_fp, &msg->any.head + 1,
362                                         hbytes - sizeof(hdr),
363                                         NULL, 1, UIO_SYSSPACE);
364                         if (error) {
365                                 kdio_printf(iocom, 1, "%s\n",
366                                             "short msg received");
367                                 error = EINVAL;
368                                 break;
369                         }
370                 }
371                 msg->aux_size = hdr.aux_bytes;
372                 if (msg->aux_size > DMSG_AUX_MAX) {
373                         kdio_printf(iocom, 1,
374                                     "illegal msg payload size %zd\n",
375                                     msg->aux_size);
376                         error = EINVAL;
377                         break;
378                 }
379                 if (msg->aux_size) {
380                         abytes = DMSG_DOALIGN(msg->aux_size);
381                         msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
382                         msg->flags |= KDMSG_FLAG_AUXALLOC;
383                         error = fp_read(iocom->msg_fp, msg->aux_data,
384                                         abytes, NULL, 1, UIO_SYSSPACE);
385                         if (error) {
386                                 kdio_printf(iocom, 1, "%s\n",
387                                             "short msg payload received");
388                                 break;
389                         }
390                 }
391
392                 error = kdmsg_msg_receive_handling(msg);
393                 msg = NULL;
394         }
395
396 #if 0
397         kdio_printf(iocom, 1, "read thread terminating error=%d\n", error);
398 #endif
399
400         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
401         if (msg)
402                 kdmsg_msg_free(msg);
403
404         /*
405          * Shutdown the socket and set KILLRX for consistency in case the
406          * shutdown was not commanded.  Signal the transmit side to shutdown
407          * by setting KILLTX and waking it up.
408          */
409         fp_shutdown(iocom->msg_fp, SHUT_RDWR);
410         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
411                                         KDMSG_CLUSTERCTL_KILLTX);
412         iocom->msgrd_td = NULL;
413         lockmgr(&iocom->msglk, LK_RELEASE);
414         wakeup(&iocom->msg_ctl);
415
416         /*
417          * iocom can be ripped out at any time once the lock is
418          * released with msgrd_td set to NULL.  The wakeup()s are safe but
419          * that is all.
420          */
421         wakeup(iocom);
422         lwkt_exit();
423 }
424
425 static
426 void
427 kdmsg_iocom_thread_wr(void *arg)
428 {
429         kdmsg_iocom_t *iocom = arg;
430         kdmsg_msg_t *msg;
431         ssize_t res;
432         size_t abytes;
433         int error = 0;
434         int save_ticks;
435         int didwarn;
436
437         /*
438          * Transmit loop
439          */
440         msg = NULL;
441         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
442
443         while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0 && error == 0) {
444                 /*
445                  * Sleep if no messages pending.  Interlock with flag while
446                  * holding msglk.
447                  */
448                 if (TAILQ_EMPTY(&iocom->msgq)) {
449                         atomic_set_int(&iocom->msg_ctl,
450                                        KDMSG_CLUSTERCTL_SLEEPING);
451                         lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
452                         atomic_clear_int(&iocom->msg_ctl,
453                                          KDMSG_CLUSTERCTL_SLEEPING);
454                 }
455
456                 while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
457                         /*
458                          * Remove msg from the transmit queue and do
459                          * persist and half-closed state handling.
460                          */
461                         TAILQ_REMOVE(&iocom->msgq, msg, qentry);
462
463                         error = kdmsg_state_msgtx(msg);
464                         if (error == EALREADY) {
465                                 error = 0;
466                                 kdmsg_msg_free(msg);
467                                 continue;
468                         }
469                         if (error) {
470                                 kdmsg_msg_free(msg);
471                                 break;
472                         }
473
474                         /*
475                          * Dump the message to the pipe or socket.
476                          *
477                          * We have to clean up the message as if the transmit
478                          * succeeded even if it failed.
479                          */
480                         lockmgr(&iocom->msglk, LK_RELEASE);
481                         error = fp_write(iocom->msg_fp, &msg->any,
482                                          msg->hdr_size, &res, UIO_SYSSPACE);
483                         if (error || res != msg->hdr_size) {
484                                 if (error == 0)
485                                         error = EINVAL;
486                                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
487                                 kdmsg_state_cleanuptx(msg);
488                                 break;
489                         }
490                         if (msg->aux_size) {
491                                 abytes = DMSG_DOALIGN(msg->aux_size);
492                                 error = fp_write(iocom->msg_fp,
493                                                  msg->aux_data, abytes,
494                                                  &res, UIO_SYSSPACE);
495                                 if (error || res != abytes) {
496                                         if (error == 0)
497                                                 error = EINVAL;
498                                         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
499                                         kdmsg_state_cleanuptx(msg);
500                                         break;
501                                 }
502                         }
503                         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
504                         kdmsg_state_cleanuptx(msg);
505                 }
506         }
507
508 #if 0
509         kdio_printf(iocom, 1, "write thread terminating error=%d\n", error);
510 #endif
511
512         /*
513          * Shutdown the socket and set KILLTX for consistency in case the
514          * shutdown was not commanded.  Signal the receive side to shutdown
515          * by setting KILLRX and waking it up.
516          */
517         fp_shutdown(iocom->msg_fp, SHUT_RDWR);
518         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
519                                         KDMSG_CLUSTERCTL_KILLTX);
520         wakeup(&iocom->msg_ctl);
521
522         /*
523          * The transmit thread is responsible for final cleanups, wait
524          * for the receive side to terminate to prevent new received
525          * states from interfering with our cleanup.
526          *
527          * Do not set msgwr_td to NULL until we actually exit.
528          */
529         while (iocom->msgrd_td) {
530                 wakeup(&iocom->msg_ctl);
531                 lksleep(iocom, &iocom->msglk, 0, "clstrkt", hz);
532         }
533
534         /*
535          * We can no longer receive new messages.  We must drain the transmit
536          * message queue and simulate received messages to close anay remaining
537          * states.
538          *
539          * Loop until all the states are gone and there are no messages
540          * pending transmit.
541          */
542         save_ticks = ticks;
543         didwarn = 0;
544
545         while (TAILQ_FIRST(&iocom->msgq) ||
546                RB_ROOT(&iocom->staterd_tree) ||
547                RB_ROOT(&iocom->statewr_tree)) {
548                 /*
549                  * Simulate failure for all sub-states of state0.
550                  */
551                 kdmsg_drain_msgq(iocom);
552                 kdio_printf(iocom, 2, "%s\n",
553                             "simulate failure for all substates of state0");
554                 kdmsg_simulate_failure(&iocom->state0, 0, DMSG_ERR_LOSTLINK);
555
556                 lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz / 2);
557
558                 if ((int)(ticks - save_ticks) > hz*2 && didwarn == 0) {
559                         didwarn = 1;
560                         kdio_printf(iocom, 0,
561                                     "Warning, write thread on %p "
562                                     "still terminating\n",
563                                     iocom);
564                 }
565                 if ((int)(ticks - save_ticks) > hz*15 && didwarn == 1) {
566                         didwarn = 2;
567                         kdio_printf(iocom, 0,
568                                     "Warning, write thread on %p "
569                                     "still terminating\n",
570                                     iocom);
571                 }
572                 if ((int)(ticks - save_ticks) > hz*60) {
573                         kdio_printf(iocom, 0,
574                                     "Can't terminate: msgq %p "
575                                     "rd_tree %p wr_tree %p\n",
576                                     TAILQ_FIRST(&iocom->msgq),
577                                     RB_ROOT(&iocom->staterd_tree),
578                                     RB_ROOT(&iocom->statewr_tree));
579                         lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz * 10);
580                 }
581         }
582
583         /*
584          * Exit handling is done by the write thread.
585          */
586         iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
587         lockmgr(&iocom->msglk, LK_RELEASE);
588
589         /*
590          * The state trees had better be empty now
591          */
592         KKASSERT(RB_EMPTY(&iocom->staterd_tree));
593         KKASSERT(RB_EMPTY(&iocom->statewr_tree));
594         KKASSERT(iocom->conn_state == NULL);
595
596         if (iocom->exit_func) {
597                 /*
598                  * iocom is invalid after we call the exit function.
599                  */
600                 iocom->msgwr_td = NULL;
601                 iocom->exit_func(iocom);
602         } else {
603                 /*
604                  * iocom can be ripped out from under us once msgwr_td is
605                  * set to NULL.  The wakeup is safe.
606                  */
607                 iocom->msgwr_td = NULL;
608                 wakeup(iocom);
609         }
610         lwkt_exit();
611 }
612
613 /*
614  * This cleans out the pending transmit message queue, adjusting any
615  * persistent states properly in the process.
616  *
617  * Called with iocom locked.
618  */
619 void
620 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
621 {
622         kdmsg_msg_t *msg;
623
624         /*
625          * Clean out our pending transmit queue, executing the
626          * appropriate state adjustments.  If this tries to open
627          * any new outgoing transactions we have to loop up and
628          * clean them out.
629          */
630         while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
631                 TAILQ_REMOVE(&iocom->msgq, msg, qentry);
632                 if (kdmsg_state_msgtx(msg))
633                         kdmsg_msg_free(msg);
634                 else
635                         kdmsg_state_cleanuptx(msg);
636         }
637 }
638
639 /*
640  * Do all processing required to handle a freshly received message
641  * after its low level header has been validated.
642  *
643  * iocom is not locked.
644  */
645 static
646 int
647 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
648 {
649         kdmsg_iocom_t *iocom = msg->state->iocom;
650         int error;
651
652         /*
653          * State machine tracking, state assignment for msg,
654          * returns error and discard status.  Errors are fatal
655          * to the connection except for EALREADY which forces
656          * a discard without execution.
657          */
658         error = kdmsg_state_msgrx(msg);
659         if (msg->state->flags & KDMSG_STATE_ABORTING) {
660                 kdio_printf(iocom, 5,
661                             "kdmsg_state_abort(b): state %p rxcmd=%08x "
662                             "txcmd=%08x msgrx error %d\n",
663                             msg->state, msg->state->rxcmd,
664                             msg->state->txcmd, error);
665         }
666         if (error) {
667                 /*
668                  * Raw protocol or connection error
669                  */
670                 if (msg->state->flags & KDMSG_STATE_ABORTING)
671                         kdio_printf(iocom, 5,
672                                     "X1 state %p error %d\n",
673                                     msg->state, error);
674                 kdmsg_msg_free(msg);
675                 if (error == EALREADY)
676                         error = 0;
677         } else if (msg->state && msg->state->func) {
678                 /*
679                  * Message related to state which already has a
680                  * handling function installed for it.
681                  */
682                 if (msg->state->flags & KDMSG_STATE_ABORTING)
683                         kdio_printf(iocom, 5,
684                                     "X2 state %p func %p\n",
685                                     msg->state, msg->state->func);
686                 error = msg->state->func(msg->state, msg);
687                 kdmsg_state_cleanuprx(msg);
688         } else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
689                 if (msg->state->flags & KDMSG_STATE_ABORTING)
690                         kdio_printf(iocom, 5,
691                                     "X3 state %p\n", msg->state);
692                 error = kdmsg_autorxmsg(msg);
693                 kdmsg_state_cleanuprx(msg);
694         } else {
695                 if (msg->state->flags & KDMSG_STATE_ABORTING)
696                         kdio_printf(iocom, 5,
697                                     "X4 state %p\n", msg->state);
698                 error = iocom->rcvmsg(msg);
699                 kdmsg_state_cleanuprx(msg);
700         }
701         return error;
702 }
703
704 /*
705  * Process state tracking for a message after reception and dequeueing,
706  * prior to execution of the state callback.  The state is updated and
707  * will be removed from the RBTREE if completely closed, but the state->parent
708  * and subq linkage is not cleaned up until after the callback (see
709  * cleanuprx()).
710  *
711  * msglk is not held.
712  *
713  * NOTE: A message transaction can consist of several messages in either
714  *       direction.
715  *
716  * NOTE: The msgid is unique to the initiator, not necessarily unique for
717  *       us or for any relay or for the return direction for that matter.
718  *       That is, two sides sending a new message can use the same msgid
719  *       without colliding.
720  *
721  * --
722  *
723  * ABORT sequences work by setting the ABORT flag along with normal message
724  * state.  However, ABORTs can also be sent on half-closed messages, that is
725  * even if the command or reply side has already sent a DELETE, as long as
726  * the message has not been fully closed it can still send an ABORT+DELETE
727  * to terminate the half-closed message state.
728  *
729  * Since ABORT+DELETEs can race we silently discard ABORT's for message
730  * state which has already been fully closed.  REPLY+ABORT+DELETEs can
731  * also race, and in this situation the other side might have already
732  * initiated a new unrelated command with the same message id.  Since
733  * the abort has not set the CREATE flag the situation can be detected
734   * and the message will also be discarded.
735  *
736  * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
737  * The ABORT request is essentially integrated into the command instead
738  * of being sent later on.  In this situation the command implementation
739  * detects that CREATE and ABORT are both set (vs ABORT alone) and can
740  * special-case non-blocking operation for the command.
741  *
742  * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
743  *        to be mid-stream aborts for command/reply sequences.  ABORTs on
744  *        one-way messages are not supported.
745  *
746  * NOTE!  If a command sequence does not support aborts the ABORT flag is
747  *        simply ignored.
748  *
749  * --
750  *
751  * One-off messages (no reply expected) are sent with neither CREATE nor DELETE
752  * set.  One-off messages cannot be aborted and typically aren't processed
753  * by these routines.  The REPLY bit can be used to distinguish whether a
754  * one-off message is a command or reply.  For example, one-off replies
755  * will typically just contain status updates.
756  */
757 static
758 int
759 kdmsg_state_msgrx(kdmsg_msg_t *msg)
760 {
761         kdmsg_iocom_t *iocom = msg->state->iocom;
762         kdmsg_state_t *state;
763         kdmsg_state_t *pstate;
764         kdmsg_state_t sdummy;
765         int error;
766
767         bzero(&sdummy, sizeof(sdummy)); /* avoid gcc warnings */
768
769         /*
770          * Make sure a state structure is ready to go in case we need a new
771          * one.  This is the only routine which uses freerd_state so no
772          * races are possible.
773          */
774         if ((state = iocom->freerd_state) == NULL) {
775                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
776                 state->flags = KDMSG_STATE_DYNAMIC;
777                 state->iocom = iocom;
778                 state->refs = 1;
779                 TAILQ_INIT(&state->subq);
780                 iocom->freerd_state = state;
781         }
782         state = NULL;   /* safety */
783
784         /*
785          * Lock RB tree and locate existing persistent state, if any.
786          *
787          * If received msg is a command state is on staterd_tree.
788          * If received msg is a reply state is on statewr_tree.
789          */
790         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
791
792 again:
793         if (msg->state == &iocom->state0) {
794                 sdummy.msgid = msg->any.head.msgid;
795                 sdummy.iocom = iocom;
796                 if (msg->any.head.cmd & DMSGF_REVTRANS) {
797                         state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree,
798                                         &sdummy);
799                 } else {
800                         state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree,
801                                         &sdummy);
802                 }
803
804                 /*
805                  * Set message state unconditionally.  If this is a CREATE
806                  * message this state will become the parent state and new
807                  * state will be allocated for the message state.
808                  */
809                 if (state == NULL)
810                         state = &iocom->state0;
811                 if (state->flags & KDMSG_STATE_INTERLOCK) {
812                         state->flags |= KDMSG_STATE_SIGNAL;
813                         lksleep(state, &iocom->msglk, 0, "dmrace", hz);
814                         goto again;
815                 }
816                 kdmsg_state_hold(state);
817                 kdmsg_state_drop(msg->state);   /* iocom->state0 */
818                 msg->state = state;
819         } else {
820                 state = msg->state;
821         }
822
823         /*
824          * Short-cut one-off or mid-stream messages.
825          */
826         if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
827                                   DMSGF_ABORT)) == 0) {
828                 error = 0;
829                 goto done;
830         }
831
832         /*
833          * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
834          * inside the case statements.
835          */
836         switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
837         case DMSGF_CREATE:
838         case DMSGF_CREATE | DMSGF_DELETE:
839                 /*
840                  * New persistant command received.
841                  */
842                 if (state != &iocom->state0) {
843                         kdio_printf(iocom, 1, "%s\n",
844                                     "duplicate transaction");
845                         error = EINVAL;
846                         break;
847                 }
848
849                 /*
850                  * Lookup the circuit.  The circuit is an open transaction.
851                  * the REVCIRC bit in the message tells us which side
852                  * initiated the transaction representing the circuit.
853                  */
854                 if (msg->any.head.circuit) {
855                         sdummy.msgid = msg->any.head.circuit;
856
857                         if (msg->any.head.cmd & DMSGF_REVCIRC) {
858                                 pstate = RB_FIND(kdmsg_state_tree,
859                                                  &iocom->statewr_tree,
860                                                  &sdummy);
861                         } else {
862                                 pstate = RB_FIND(kdmsg_state_tree,
863                                                  &iocom->staterd_tree,
864                                                  &sdummy);
865                         }
866                         if (pstate == NULL) {
867                                 kdio_printf(iocom, 1, "%s\n",
868                                             "missing parent in "
869                                             "stacked trans");
870                                 error = EINVAL;
871                                 break;
872                         }
873                 } else {
874                         pstate = &iocom->state0;
875                 }
876
877                 /*
878                  * Allocate new state.
879                  *
880                  * msg->state becomes the owner of the ref we inherit from
881                  * freerd_stae.
882                  */
883                 kdmsg_state_drop(state);
884                 state = iocom->freerd_state;
885                 iocom->freerd_state = NULL;
886
887                 msg->state = state;             /* inherits freerd ref */
888                 state->parent = pstate;
889                 KKASSERT(state->iocom == iocom);
890                 state->flags |= KDMSG_STATE_RBINSERTED |
891                                 KDMSG_STATE_SUBINSERTED |
892                                 KDMSG_STATE_OPPOSITE;
893                 if (TAILQ_EMPTY(&pstate->subq))
894                         kdmsg_state_hold(pstate);/* states on pstate->subq */
895                 kdmsg_state_hold(state);        /* state on pstate->subq */
896                 kdmsg_state_hold(state);        /* state on rbtree */
897                 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
898                 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
899                 state->txcmd = DMSGF_REPLY;
900                 state->msgid = msg->any.head.msgid;
901                 state->flags &= ~KDMSG_STATE_NEW;
902                 RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
903                 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
904                 error = 0;
905                 break;
906         case DMSGF_DELETE:
907                 /*
908                  * Persistent state is expected but might not exist if an
909                  * ABORT+DELETE races the close.
910                  */
911                 if (state == &iocom->state0) {
912                         if (msg->any.head.cmd & DMSGF_ABORT) {
913                                 kdio_printf(iocom, 1, "%s\n",
914                                             "msgrx: "
915                                             "state already A");
916                                 error = EALREADY;
917                         } else {
918                                 kdio_printf(iocom, 1, "%s\n",
919                                             "msgrx: no state for DELETE");
920                                 error = EINVAL;
921                         }
922                         break;
923                 }
924
925                 /*
926                  * Handle another ABORT+DELETE case if the msgid has already
927                  * been reused.
928                  */
929                 if ((state->rxcmd & DMSGF_CREATE) == 0) {
930                         if (msg->any.head.cmd & DMSGF_ABORT) {
931                                 kdio_printf(iocom, 1, "%s\n",
932                                             "msgrx: state already B");
933                                 error = EALREADY;
934                         } else {
935                                 kdio_printf(iocom, 1, "%s\n",
936                                             "msgrx: state reused for DELETE");
937                                 error = EINVAL;
938                         }
939                         break;
940                 }
941                 error = 0;
942                 break;
943         default:
944                 /*
945                  * Check for mid-stream ABORT command received, otherwise
946                  * allow.
947                  */
948                 if (msg->any.head.cmd & DMSGF_ABORT) {
949                         if (state == &iocom->state0 ||
950                             (state->rxcmd & DMSGF_CREATE) == 0) {
951                                 error = EALREADY;
952                                 break;
953                         }
954                 }
955                 error = 0;
956                 break;
957         case DMSGF_REPLY | DMSGF_CREATE:
958         case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
959                 /*
960                  * When receiving a reply with CREATE set the original
961                  * persistent state message should already exist.
962                  */
963                 if (state == &iocom->state0) {
964                         kdio_printf(iocom, 1,
965                                     "msgrx: no state match for "
966                                     "REPLY cmd=%08x msgid=%016jx\n",
967                                     msg->any.head.cmd,
968                                     (intmax_t)msg->any.head.msgid);
969                         error = EINVAL;
970                         break;
971                 }
972                 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
973                 error = 0;
974                 break;
975         case DMSGF_REPLY | DMSGF_DELETE:
976                 /*
977                  * Received REPLY+ABORT+DELETE in case where msgid has
978                  * already been fully closed, ignore the message.
979                  */
980                 if (state == &iocom->state0) {
981                         if (msg->any.head.cmd & DMSGF_ABORT) {
982                                 error = EALREADY;
983                         } else {
984                                 kdio_printf(iocom, 1, "%s\n",
985                                             "msgrx: no state match "
986                                             "for REPLY|DELETE");
987                                 error = EINVAL;
988                         }
989                         break;
990                 }
991
992                 /*
993                  * Received REPLY+ABORT+DELETE in case where msgid has
994                  * already been reused for an unrelated message,
995                  * ignore the message.
996                  */
997                 if ((state->rxcmd & DMSGF_CREATE) == 0) {
998                         if (msg->any.head.cmd & DMSGF_ABORT) {
999                                 error = EALREADY;
1000                         } else {
1001                                 kdio_printf(iocom, 1, "%s\n",
1002                                             "msgrx: state reused "
1003                                             "for REPLY|DELETE");
1004                                 error = EINVAL;
1005                         }
1006                         break;
1007                 }
1008                 error = 0;
1009                 break;
1010         case DMSGF_REPLY:
1011                 /*
1012                  * Check for mid-stream ABORT reply received to sent command.
1013                  */
1014                 if (msg->any.head.cmd & DMSGF_ABORT) {
1015                         if (state == &iocom->state0 ||
1016                             (state->rxcmd & DMSGF_CREATE) == 0) {
1017                                 error = EALREADY;
1018                                 break;
1019                         }
1020                 }
1021                 error = 0;
1022                 break;
1023         }
1024
1025         /*
1026          * Calculate the easy-switch() transactional command.  Represents
1027          * the outer-transaction command for any transaction-create or
1028          * transaction-delete, and the inner message command for any
1029          * non-transaction or inside-transaction command.  tcmd will be
1030          * set to 0 if the message state is illegal.
1031          *
1032          * The two can be told apart because outer-transaction commands
1033          * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
1034          */
1035 done:
1036         if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
1037                 if (state != &iocom->state0) {
1038                         msg->tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1039                                     (msg->any.head.cmd & (DMSGF_CREATE |
1040                                                           DMSGF_DELETE |
1041                                                           DMSGF_REPLY));
1042                 } else {
1043                         msg->tcmd = 0;
1044                 }
1045         } else {
1046                 msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
1047         }
1048
1049         /*
1050          * Adjust the state for DELETE handling now, before making the
1051          * callback so we are atomic with other state updates.
1052          *
1053          * Subq/parent linkages are cleaned up after the callback.
1054          * If an error occurred the message is ignored and state is not
1055          * updated.
1056          */
1057         if ((state = msg->state) == NULL || error != 0) {
1058                 kdio_printf(iocom, 1,
1059                             "msgrx: state=%p error %d\n",
1060                             state, error);
1061         } else if (msg->any.head.cmd & DMSGF_DELETE) {
1062                 KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1063                 state->rxcmd |= DMSGF_DELETE;
1064                 if (state->txcmd & DMSGF_DELETE) {
1065                         KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1066                         if (state->rxcmd & DMSGF_REPLY) {
1067                                 KKASSERT(msg->any.head.cmd &
1068                                          DMSGF_REPLY);
1069                                 RB_REMOVE(kdmsg_state_tree,
1070                                           &iocom->statewr_tree, state);
1071                         } else {
1072                                 KKASSERT((msg->any.head.cmd &
1073                                           DMSGF_REPLY) == 0);
1074                                 RB_REMOVE(kdmsg_state_tree,
1075                                           &iocom->staterd_tree, state);
1076                         }
1077                         state->flags &= ~KDMSG_STATE_RBINSERTED;
1078                         kdmsg_state_drop(state);        /* state on rbtree */
1079                 }
1080         }
1081         lockmgr(&iocom->msglk, LK_RELEASE);
1082
1083         return (error);
1084 }
1085
1086 /*
1087  * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
1088  * This routine must call iocom->rcvmsg() for anything not automatically
1089  * handled.
1090  */
1091 static int
1092 kdmsg_autorxmsg(kdmsg_msg_t *msg)
1093 {
1094         kdmsg_iocom_t *iocom = msg->state->iocom;
1095         kdmsg_msg_t *rep;
1096         int error = 0;
1097         uint32_t cmd;
1098
1099         /*
1100          * Main switch processes transaction create/delete sequences only.
1101          * Use icmd (DELETEs use DMSG_LNK_ERROR
1102          *
1103          * NOTE: If processing in-transaction messages you generally want
1104          *       an inner switch on msg->any.head.cmd.
1105          */
1106         if (msg->state) {
1107                 cmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1108                       (msg->any.head.cmd & (DMSGF_CREATE |
1109                                             DMSGF_DELETE |
1110                                             DMSGF_REPLY));
1111         } else {
1112                 cmd = 0;
1113         }
1114
1115         switch(cmd) {
1116         case DMSG_LNK_PING:
1117                 /*
1118                  * Received ping, send reply
1119                  */
1120                 rep = kdmsg_msg_alloc(msg->state, DMSG_LNK_PING | DMSGF_REPLY,
1121                                       NULL, NULL);
1122                 kdmsg_msg_write(rep);
1123                 break;
1124         case DMSG_LNK_PING | DMSGF_REPLY:
1125                 /* ignore replies */
1126                 break;
1127         case DMSG_LNK_CONN | DMSGF_CREATE:
1128         case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
1129                 /*
1130                  * Received LNK_CONN transaction.  Transmit response and
1131                  * leave transaction open, which allows the other end to
1132                  * start to the SPAN protocol.
1133                  *
1134                  * Handle shim after acknowledging the CONN.
1135                  */
1136                 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1137                         if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1138                                 kdmsg_msg_result(msg, 0);
1139                                 if (iocom->auto_callback)
1140                                         iocom->auto_callback(msg);
1141                         } else {
1142                                 error = iocom->rcvmsg(msg);
1143                         }
1144                         break;
1145                 }
1146                 /* fall through */
1147         case DMSG_LNK_CONN | DMSGF_DELETE:
1148                 /*
1149                  * This message is usually simulated after a link is lost
1150                  * to clean up the transaction.
1151                  */
1152                 if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1153                         if (iocom->auto_callback)
1154                                 iocom->auto_callback(msg);
1155                         kdmsg_msg_reply(msg, 0);
1156                 } else {
1157                         error = iocom->rcvmsg(msg);
1158                 }
1159                 break;
1160         case DMSG_LNK_SPAN | DMSGF_CREATE:
1161         case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1162                 /*
1163                  * Received LNK_SPAN transaction.  We do not have to respond
1164                  * (except on termination), but we must leave the transaction
1165                  * open.
1166                  *
1167                  * Handle shim after acknowledging the SPAN.
1168                  */
1169                 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1170                         if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1171                                 if (iocom->auto_callback)
1172                                         iocom->auto_callback(msg);
1173                                 break;
1174                         }
1175                         /* fall through */
1176                 } else {
1177                         error = iocom->rcvmsg(msg);
1178                         break;
1179                 }
1180                 /* fall through */
1181         case DMSG_LNK_SPAN | DMSGF_DELETE:
1182                 /*
1183                  * Process shims (auto_callback) before cleaning up the
1184                  * circuit structure and closing the transactions.  Device
1185                  * driver should ensure that the circuit is not used after
1186                  * the auto_callback() returns.
1187                  *
1188                  * Handle shim before closing the SPAN transaction.
1189                  */
1190                 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1191                         if (iocom->auto_callback)
1192                                 iocom->auto_callback(msg);
1193                         kdmsg_msg_reply(msg, 0);
1194                 } else {
1195                         error = iocom->rcvmsg(msg);
1196                 }
1197                 break;
1198         default:
1199                 /*
1200                  * Anything unhandled goes into rcvmsg.
1201                  *
1202                  * NOTE: Replies to link-level messages initiated by our side
1203                  *       are handled by the state callback, they are NOT
1204                  *       handled here.
1205                  */
1206                 error = iocom->rcvmsg(msg);
1207                 break;
1208         }
1209         return (error);
1210 }
1211
1212 /*
1213  * Post-receive-handling message and state cleanup.  This routine is called
1214  * after the state function handling/callback to properly dispose of the
1215  * message and unlink the state's parent/subq linkage if the state is
1216  * completely closed.
1217  *
1218  * msglk is not held.
1219  */
1220 static
1221 void
1222 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1223 {
1224         kdmsg_state_t *state = msg->state;
1225         kdmsg_iocom_t *iocom = state->iocom;
1226
1227         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1228         if (state != &iocom->state0) {
1229                 /*
1230                  * When terminating a transaction (in either direction), all
1231                  * sub-states are aborted.
1232                  */
1233                 if ((msg->any.head.cmd & DMSGF_DELETE) &&
1234                     TAILQ_FIRST(&msg->state->subq)) {
1235                         kdio_printf(iocom, 2,
1236                                     "simulate failure for substates of "
1237                                     "state %p cmd %08x/%08x\n",
1238                                     msg->state,
1239                                     msg->state->rxcmd,
1240                                     msg->state->txcmd);
1241                         kdmsg_simulate_failure(msg->state,
1242                                                0, DMSG_ERR_LOSTLINK);
1243                 }
1244
1245                 /*
1246                  * Once the state is fully closed we can (try to) remove it
1247                  * from the subq topology.
1248                  */
1249                 if ((state->flags & KDMSG_STATE_SUBINSERTED) &&
1250                     (state->rxcmd & DMSGF_DELETE) &&
1251                     (state->txcmd & DMSGF_DELETE)) {
1252                         /* 
1253                          * Remove parent linkage if state is completely closed.
1254                          */
1255                         kdmsg_subq_delete(state);
1256                 }
1257         }
1258         kdmsg_msg_free(msg);
1259
1260         lockmgr(&iocom->msglk, LK_RELEASE);
1261 }
1262
1263 /*
1264  * Remove state from its parent's subq.  This can wind up recursively
1265  * dropping the parent upward.
1266  *
1267  * NOTE: Once we drop the parent, our pstate pointer may become invalid.
1268  */
1269 static
1270 void
1271 kdmsg_subq_delete(kdmsg_state_t *state)
1272 {
1273         kdmsg_state_t *pstate;
1274
1275         if (state->flags & KDMSG_STATE_SUBINSERTED) {
1276                 pstate = state->parent;
1277                 KKASSERT(pstate);
1278                 if (pstate->scan == state)
1279                         pstate->scan = NULL;
1280                 TAILQ_REMOVE(&pstate->subq, state, entry);
1281                 state->flags &= ~KDMSG_STATE_SUBINSERTED;
1282                 state->parent = NULL;
1283                 if (TAILQ_EMPTY(&pstate->subq)) {
1284                         kdmsg_state_drop(pstate);/* pstate->subq */
1285                 }
1286                 pstate = NULL;                   /* safety */
1287                 kdmsg_state_drop(state);         /* pstate->subq */
1288         } else {
1289                 KKASSERT(state->parent == NULL);
1290         }
1291 }
1292
1293 /*
1294  * Simulate receiving a message which terminates an active transaction
1295  * state.  Our simulated received message must set DELETE and may also
1296  * have to set CREATE.  It must also ensure that all fields are set such
1297  * that the receive handling code can find the state (kdmsg_state_msgrx())
1298  * or an endless loop will ensue.
1299  *
1300  * This is used when the other end of the link is dead so the device driver
1301  * gets a completed transaction for all pending states.
1302  *
1303  * Called with iocom locked.
1304  */
1305 static
1306 void
1307 kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error)
1308 {
1309         kdmsg_state_t *substate;
1310
1311         kdmsg_state_hold(state);                /* aborting */
1312
1313         /*
1314          * Abort parent state first. Parent will not actually disappear
1315          * until children are gone.  Device drivers must handle the situation.
1316          * The advantage of this is that device drivers can flag the situation
1317          * as an interlock against new operations on dying states.  And since
1318          * device operations are often asynchronous anyway, this sequence of
1319          * events works out better.
1320          */
1321         if (meto)
1322                 kdmsg_state_abort(state);
1323
1324         /*
1325          * Recurse through any children.
1326          */
1327 again:
1328         TAILQ_FOREACH(substate, &state->subq, entry) {
1329                 if (substate->flags & KDMSG_STATE_ABORTING)
1330                         continue;
1331                 state->scan = substate;
1332                 kdmsg_simulate_failure(substate, 1, error);
1333                 if (state->scan != substate)
1334                         goto again;
1335         }
1336         kdmsg_state_drop(state);                /* aborting */
1337 }
1338
1339 static
1340 void
1341 kdmsg_state_abort(kdmsg_state_t *state)
1342 {
1343         kdmsg_msg_t *msg;
1344
1345         /*
1346          * Set ABORTING and DYING, return if already set.  If the state was
1347          * just allocated we defer the abort operation until the related
1348          * message is processed.
1349          */
1350         KKASSERT((state->flags & KDMSG_STATE_ABORTING) == 0);
1351         if (state->flags & KDMSG_STATE_ABORTING)
1352                 return;
1353         state->flags |= KDMSG_STATE_ABORTING;
1354         kdmsg_state_dying(state);
1355         if (state->flags & KDMSG_STATE_NEW) {
1356                 kdio_printf(iocom, 5,
1357                             "kdmsg_state_abort(0): state %p rxcmd %08x "
1358                             "txcmd %08x flags %08x - in NEW state\n",
1359                             state, state->rxcmd,
1360                             state->txcmd, state->flags);
1361                 return;
1362         }
1363
1364         /*
1365          * NOTE: The DELETE flag might already be set due to an early
1366          *       termination.
1367          *
1368          * NOTE: Args to kdmsg_msg_alloc() to avoid dynamic state allocation.
1369          *
1370          * NOTE: We are simulating a received message using our state
1371          *       (vs a message generated by the other side using its state),
1372          *       so we must invert DMSGF_REVTRANS and DMSGF_REVCIRC.
1373          */
1374         kdio_printf(iocom, 5, 
1375                     "kdmsg_state_abort(1): state %p rxcmd %08x txcmd %08x\n",
1376                     state, state->rxcmd, state->txcmd);
1377         if ((state->rxcmd & DMSGF_DELETE) == 0) {
1378                 msg = kdmsg_msg_alloc(state, DMSG_LNK_ERROR, NULL, NULL);
1379                 if ((state->rxcmd & DMSGF_CREATE) == 0)
1380                         msg->any.head.cmd |= DMSGF_CREATE;
1381                 msg->any.head.cmd |= DMSGF_DELETE |
1382                                      (state->rxcmd & DMSGF_REPLY);
1383                 msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
1384                 msg->any.head.error = DMSG_ERR_LOSTLINK;
1385                 kdio_printf(iocom, 5,
1386                             "kdmsg_state_abort(a): state %p msgcmd %08x\n",
1387                             state, msg->any.head.cmd);
1388                 /* circuit not initialized */
1389                 lockmgr(&state->iocom->msglk, LK_RELEASE);
1390                 kdmsg_msg_receive_handling(msg);
1391                 lockmgr(&state->iocom->msglk, LK_EXCLUSIVE);
1392                 msg = NULL;
1393         }
1394         kdio_printf(iocom, 5,
1395                     "kdmsg_state_abort(2): state %p rxcmd %08x txcmd %08x\n",
1396                     state, state->rxcmd, state->txcmd);
1397 }
1398
1399 /*
1400  * Recursively sets KDMSG_STATE_DYING on state and all sub-states, preventing
1401  * the transmission of any new messages on these states.  This is done
1402  * atomically when parent state is terminating, whereas setting ABORTING is
1403  * not atomic and can leak races.
1404  */
1405 static
1406 void
1407 kdmsg_state_dying(kdmsg_state_t *state)
1408 {
1409         kdmsg_state_t *scan;
1410
1411         if ((state->flags & KDMSG_STATE_DYING) == 0) {
1412                 state->flags |= KDMSG_STATE_DYING;
1413                 TAILQ_FOREACH(scan, &state->subq, entry)
1414                         kdmsg_state_dying(scan);
1415         }
1416 }
1417
1418 /*
1419  * Process state tracking for a message prior to transmission.
1420  *
1421  * Called with msglk held and the msg dequeued.  Returns non-zero if
1422  * the message is bad and should be deleted by the caller.
1423  *
1424  * One-off messages are usually with dummy state and msg->state may be NULL
1425  * in this situation.
1426  *
1427  * New transactions (when CREATE is set) will insert the state.
1428  *
1429  * May request that caller discard the message by setting *discardp to 1.
1430  * A NULL state may be returned in this case.
1431  */
1432 static
1433 int
1434 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1435 {
1436         kdmsg_iocom_t *iocom = msg->state->iocom;
1437         kdmsg_state_t *state;
1438         int error;
1439
1440         /*
1441          * Make sure a state structure is ready to go in case we need a new
1442          * one.  This is the only routine which uses freewr_state so no
1443          * races are possible.
1444          */
1445         if ((state = iocom->freewr_state) == NULL) {
1446                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1447                 state->flags = KDMSG_STATE_DYNAMIC;
1448                 state->iocom = iocom;
1449                 state->refs = 1;
1450                 TAILQ_INIT(&state->subq);
1451                 iocom->freewr_state = state;
1452         }
1453
1454         /*
1455          * Lock RB tree.  If persistent state is present it will have already
1456          * been assigned to msg.
1457          */
1458         state = msg->state;
1459
1460         /*
1461          * Short-cut one-off or mid-stream messages (state may be NULL).
1462          */
1463         if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1464                                   DMSGF_ABORT)) == 0) {
1465                 return(0);
1466         }
1467
1468
1469         /*
1470          * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1471          * inside the case statements.
1472          */
1473         switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1474                                     DMSGF_REPLY)) {
1475         case DMSGF_CREATE:
1476         case DMSGF_CREATE | DMSGF_DELETE:
1477                 /*
1478                  * Insert the new persistent message state and mark
1479                  * half-closed if DELETE is set.  Since this is a new
1480                  * message it isn't possible to transition into the fully
1481                  * closed state here.
1482                  *
1483                  * XXX state must be assigned and inserted by
1484                  *     kdmsg_msg_write().  txcmd is assigned by us
1485                  *     on-transmit.
1486                  */
1487                 KKASSERT(state != NULL);
1488                 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1489                 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1490                 state->rxcmd = DMSGF_REPLY;
1491                 state->flags &= ~KDMSG_STATE_NEW;
1492                 error = 0;
1493                 break;
1494         case DMSGF_DELETE:
1495                 /*
1496                  * Sent ABORT+DELETE in case where msgid has already
1497                  * been fully closed, ignore the message.
1498                  */
1499                 if (state == &iocom->state0) {
1500                         if (msg->any.head.cmd & DMSGF_ABORT) {
1501                                 error = EALREADY;
1502                         } else {
1503                                 kdio_printf(iocom, 1,
1504                                         "msgtx: no state match "
1505                                         "for DELETE cmd=%08x msgid=%016jx\n",
1506                                         msg->any.head.cmd,
1507                                         (intmax_t)msg->any.head.msgid);
1508                                 error = EINVAL;
1509                         }
1510                         break;
1511                 }
1512
1513                 /*
1514                  * Sent ABORT+DELETE in case where msgid has
1515                  * already been reused for an unrelated message,
1516                  * ignore the message.
1517                  */
1518                 if ((state->txcmd & DMSGF_CREATE) == 0) {
1519                         if (msg->any.head.cmd & DMSGF_ABORT) {
1520                                 error = EALREADY;
1521                         } else {
1522                                 kdio_printf(iocom, 1, "%s\n",
1523                                             "msgtx: state reused "
1524                                             "for DELETE");
1525                                 error = EINVAL;
1526                         }
1527                         break;
1528                 }
1529                 error = 0;
1530                 break;
1531         default:
1532                 /*
1533                  * Check for mid-stream ABORT command sent
1534                  */
1535                 if (msg->any.head.cmd & DMSGF_ABORT) {
1536                         if (state == &state->iocom->state0 ||
1537                             (state->txcmd & DMSGF_CREATE) == 0) {
1538                                 error = EALREADY;
1539                                 break;
1540                         }
1541                 }
1542                 error = 0;
1543                 break;
1544         case DMSGF_REPLY | DMSGF_CREATE:
1545         case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1546                 /*
1547                  * When transmitting a reply with CREATE set the original
1548                  * persistent state message should already exist.
1549                  */
1550                 if (state == &state->iocom->state0) {
1551                         kdio_printf(iocom, 1, "%s\n",
1552                                     "msgtx: no state match "
1553                                     "for REPLY | CREATE");
1554                         error = EINVAL;
1555                         break;
1556                 }
1557                 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1558                 error = 0;
1559                 break;
1560         case DMSGF_REPLY | DMSGF_DELETE:
1561                 /*
1562                  * When transmitting a reply with DELETE set the original
1563                  * persistent state message should already exist.
1564                  *
1565                  * This is very similar to the REPLY|CREATE|* case except
1566                  * txcmd is already stored, so we just add the DELETE flag.
1567                  *
1568                  * Sent REPLY+ABORT+DELETE in case where msgid has
1569                  * already been fully closed, ignore the message.
1570                  */
1571                 if (state == &state->iocom->state0) {
1572                         if (msg->any.head.cmd & DMSGF_ABORT) {
1573                                 error = EALREADY;
1574                         } else {
1575                                 kdio_printf(iocom, 1, "%s\n",
1576                                             "msgtx: no state match "
1577                                             "for REPLY | DELETE");
1578                                 error = EINVAL;
1579                         }
1580                         break;
1581                 }
1582
1583                 /*
1584                  * Sent REPLY+ABORT+DELETE in case where msgid has already
1585                  * been reused for an unrelated message, ignore the message.
1586                  */
1587                 if ((state->txcmd & DMSGF_CREATE) == 0) {
1588                         if (msg->any.head.cmd & DMSGF_ABORT) {
1589                                 error = EALREADY;
1590                         } else {
1591                                 kdio_printf(iocom, 1, "%s\n",
1592                                             "msgtx: state reused "
1593                                             "for REPLY | DELETE");
1594                                 error = EINVAL;
1595                         }
1596                         break;
1597                 }
1598                 error = 0;
1599                 break;
1600         case DMSGF_REPLY:
1601                 /*
1602                  * Check for mid-stream ABORT reply sent.
1603                  *
1604                  * One-off REPLY messages are allowed for e.g. status updates.
1605                  */
1606                 if (msg->any.head.cmd & DMSGF_ABORT) {
1607                         if (state == &state->iocom->state0 ||
1608                             (state->txcmd & DMSGF_CREATE) == 0) {
1609                                 error = EALREADY;
1610                                 break;
1611                         }
1612                 }
1613                 error = 0;
1614                 break;
1615         }
1616
1617         /*
1618          * Set interlock (XXX hack) in case the send side blocks and a
1619          * response is returned before kdmsg_state_cleanuptx() can be
1620          * run.
1621          */
1622         if (state && error == 0)
1623                 state->flags |= KDMSG_STATE_INTERLOCK;
1624
1625         return (error);
1626 }
1627
1628 /*
1629  * Called with iocom locked.
1630  */
1631 static
1632 void
1633 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1634 {
1635         kdmsg_iocom_t *iocom = msg->state->iocom;
1636         kdmsg_state_t *state;
1637
1638         if ((state = msg->state) == NULL) {
1639                 kdmsg_msg_free(msg);
1640                 return;
1641         }
1642
1643         /*
1644          * Clear interlock (XXX hack) in case the send side blocks and a
1645          * response is returned in the other thread before
1646          * kdmsg_state_cleanuptx() can be run.  We maintain our hold on
1647          * iocom->msglk so we can do this before completing our task.
1648          */
1649         if (state->flags & KDMSG_STATE_SIGNAL) {
1650                 kdio_printf(iocom, 1, "state %p interlock!\n", state);
1651                 wakeup(state);
1652         }
1653         state->flags &= ~(KDMSG_STATE_INTERLOCK | KDMSG_STATE_SIGNAL);
1654         kdmsg_state_hold(state);
1655
1656         if (msg->any.head.cmd & DMSGF_DELETE) {
1657                 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1658                 state->txcmd |= DMSGF_DELETE;
1659                 if (state->rxcmd & DMSGF_DELETE) {
1660                         KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1661                         if (state->txcmd & DMSGF_REPLY) {
1662                                 KKASSERT(msg->any.head.cmd &
1663                                          DMSGF_REPLY);
1664                                 RB_REMOVE(kdmsg_state_tree,
1665                                           &iocom->staterd_tree, state);
1666                         } else {
1667                                 KKASSERT((msg->any.head.cmd &
1668                                           DMSGF_REPLY) == 0);
1669                                 RB_REMOVE(kdmsg_state_tree,
1670                                           &iocom->statewr_tree, state);
1671                         }
1672                         state->flags &= ~KDMSG_STATE_RBINSERTED;
1673
1674                         /*
1675                          * The subq recursion is used for parent linking and
1676                          * scanning the topology for aborts, we can only
1677                          * remove leafs.  The circuit is effectively dead now,
1678                          * but topology won't be torn down until all of its
1679                          * children have finished/aborted.
1680                          *
1681                          * This is particularly important for end-point
1682                          * devices which might need to access private data
1683                          * in parent states.  Out of order disconnects can
1684                          * occur if an end-point device is processing a
1685                          * message transaction asynchronously because abort
1686                          * requests are basically synchronous and it probably
1687                          * isn't convenient (or possible) for the end-point
1688                          * to abort an asynchronous operation.
1689                          */
1690                         if (TAILQ_EMPTY(&state->subq))
1691                                 kdmsg_subq_delete(state);
1692                         kdmsg_msg_free(msg);
1693                         kdmsg_state_drop(state);   /* state on rbtree */
1694                 } else {
1695                         kdmsg_msg_free(msg);
1696                 }
1697         } else {
1698                 kdmsg_msg_free(msg);
1699         }
1700
1701         /*
1702          * Deferred abort after transmission.
1703          */
1704         if ((state->flags & (KDMSG_STATE_ABORTING | KDMSG_STATE_DYING)) &&
1705             (state->rxcmd & DMSGF_DELETE) == 0) {
1706                 kdio_printf(iocom, 5,
1707                             "kdmsg_state_cleanuptx: state=%p "
1708                             "executing deferred abort\n",
1709                             state);
1710                 state->flags &= ~KDMSG_STATE_ABORTING;
1711                 kdmsg_state_abort(state);
1712         }
1713         kdmsg_state_drop(state);
1714 }
1715
1716 static
1717 void
1718 _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1719 {
1720         atomic_add_int(&state->refs, 1);
1721 #if KDMSG_DEBUG
1722         kd_printf(4, "state %p +%d\t%s:%d\n", state, state->refs, file, line);
1723 #endif
1724 }
1725
1726 static
1727 void
1728 _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1729 {
1730         KKASSERT(state->refs > 0);
1731 #if KDMSG_DEBUG
1732         kd_printf(4, "state %p -%d\t%s:%d\n", state, state->refs, file, line);
1733 #endif
1734         if (atomic_fetchadd_int(&state->refs, -1) == 1)
1735                 kdmsg_state_free(state);
1736 }
1737
1738 static
1739 void
1740 kdmsg_state_free(kdmsg_state_t *state)
1741 {
1742         kdmsg_iocom_t *iocom = state->iocom;
1743
1744         KKASSERT((state->flags & KDMSG_STATE_RBINSERTED) == 0);
1745         KKASSERT((state->flags & KDMSG_STATE_SUBINSERTED) == 0);
1746         KKASSERT(TAILQ_EMPTY(&state->subq));
1747
1748         if (state != &state->iocom->state0)
1749                 kfree(state, iocom->mmsg);
1750 }
1751
1752 kdmsg_msg_t *
1753 kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
1754                 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1755 {
1756         kdmsg_iocom_t *iocom = state->iocom;
1757         kdmsg_state_t *pstate;
1758         kdmsg_msg_t *msg;
1759         size_t hbytes;
1760
1761         KKASSERT(iocom != NULL);
1762         hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1763         msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1764                       iocom->mmsg, M_WAITOK | M_ZERO);
1765         msg->hdr_size = hbytes;
1766
1767         if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
1768                 /*
1769                  * New transaction, requires tracking state and a unique
1770                  * msgid to be allocated.
1771                  *
1772                  * It is possible to race a circuit failure, inherit the
1773                  * parent's STATE_DYING flag to trigger an abort sequence
1774                  * in the transmit path.  By not inheriting ABORTING the
1775                  * abort sequence can recurse.
1776                  *
1777                  * NOTE: The transactions has not yet been initiated so we
1778                  *       cannot set DMSGF_CREATE/DELETE bits in txcmd or rxcmd.
1779                  *       We have to properly setup DMSGF_REPLY, however.
1780                  */
1781                 pstate = state;
1782                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1783                 TAILQ_INIT(&state->subq);
1784                 state->iocom = iocom;
1785                 state->parent = pstate;
1786                 state->flags = KDMSG_STATE_DYNAMIC |
1787                                KDMSG_STATE_NEW;
1788                 state->func = func;
1789                 state->any.any = data;
1790                 state->msgid = (uint64_t)(uintptr_t)state;
1791                 /*msg->any.head.msgid = state->msgid;XXX*/
1792
1793                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1794                 if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1795                         panic("duplicate msgid allocated");
1796                 if (TAILQ_EMPTY(&pstate->subq))
1797                         kdmsg_state_hold(pstate);/* pstate->subq */
1798                 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
1799                 state->flags |= KDMSG_STATE_RBINSERTED |
1800                                 KDMSG_STATE_SUBINSERTED;
1801                 state->flags |= pstate->flags & KDMSG_STATE_DYING;
1802                 kdmsg_state_hold(state);        /* pstate->subq */
1803                 kdmsg_state_hold(state);        /* state on rbtree */
1804                 kdmsg_state_hold(state);        /* msg->state */
1805                 lockmgr(&iocom->msglk, LK_RELEASE);
1806         } else {
1807                 pstate = state->parent;
1808                 KKASSERT(pstate != NULL);
1809                 kdmsg_state_hold(state);        /* msg->state */
1810         }
1811
1812         if (state->flags & KDMSG_STATE_OPPOSITE)
1813                 cmd |= DMSGF_REVTRANS;
1814         if (pstate->flags & KDMSG_STATE_OPPOSITE)
1815                 cmd |= DMSGF_REVCIRC;
1816
1817         msg->any.head.magic = DMSG_HDR_MAGIC;
1818         msg->any.head.cmd = cmd;
1819         msg->any.head.msgid = state->msgid;
1820         msg->any.head.circuit = pstate->msgid;
1821         msg->state = state;
1822
1823         return (msg);
1824 }
1825
1826 void
1827 kdmsg_msg_free(kdmsg_msg_t *msg)
1828 {
1829         kdmsg_iocom_t *iocom = msg->state->iocom;
1830         kdmsg_state_t *state;
1831
1832         if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1833             msg->aux_data && msg->aux_size) {
1834                 kfree(msg->aux_data, iocom->mmsg);
1835                 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1836         }
1837         if ((state = msg->state) != NULL) {
1838                 msg->state = NULL;
1839                 kdmsg_state_drop(state);        /* msg->state */
1840         }
1841         msg->aux_data = NULL;
1842         msg->aux_size = 0;
1843
1844         kfree(msg, iocom->mmsg);
1845 }
1846
1847 void
1848 kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data)
1849 {
1850         if (msg->flags & KDMSG_FLAG_AUXALLOC) {
1851                 data->aux_data = msg->aux_data;
1852                 data->aux_size = msg->aux_size;
1853                 data->iocom = msg->state->iocom;
1854                 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1855         } else {
1856                 data->aux_data = NULL;
1857                 data->aux_size = 0;
1858                 data->iocom = msg->state->iocom;
1859         }
1860 }
1861
1862 void
1863 kdmsg_free_aux_data(kdmsg_data_t *data)
1864 {
1865         if (data->aux_data)
1866                 kfree(data->aux_data, data->iocom->mmsg);
1867 }
1868
1869 /*
1870  * Indexed messages are stored in a red-black tree indexed by their
1871  * msgid.  Only persistent messages are indexed.
1872  */
1873 int
1874 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1875 {
1876         if (state1->iocom < state2->iocom)
1877                 return(-1);
1878         if (state1->iocom > state2->iocom)
1879                 return(1);
1880         if (state1->msgid < state2->msgid)
1881                 return(-1);
1882         if (state1->msgid > state2->msgid)
1883                 return(1);
1884         return(0);
1885 }
1886
1887 /*
1888  * Write a message.  All requisit command flags have been set.
1889  *
1890  * If msg->state is non-NULL the message is written to the existing
1891  * transaction.  msgid will be set accordingly.
1892  *
1893  * If msg->state is NULL and CREATE is set new state is allocated and
1894  * (func, data) is installed.  A msgid is assigned.
1895  *
1896  * If msg->state is NULL and CREATE is not set the message is assumed
1897  * to be a one-way message.  The originator must assign the msgid
1898  * (or leave it 0, which is typical.
1899  *
1900  * This function merely queues the message to the management thread, it
1901  * does not write to the message socket/pipe.
1902  */
1903 void
1904 kdmsg_msg_write(kdmsg_msg_t *msg)
1905 {
1906         kdmsg_iocom_t *iocom = msg->state->iocom;
1907
1908         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1909         kdmsg_msg_write_locked(iocom, msg);
1910         lockmgr(&iocom->msglk, LK_RELEASE);
1911 }
1912
1913 static void
1914 kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg)
1915 {
1916         kdmsg_state_t *state;
1917
1918         if (msg->state) {
1919                 /*
1920                  * Continuance or termination of existing transaction.
1921                  * The transaction could have been initiated by either end.
1922                  *
1923                  * (Function callback and aux data for the receive side can
1924                  * be replaced or left alone).
1925                  */
1926                 state = msg->state;
1927                 msg->any.head.msgid = state->msgid;
1928         } else {
1929                 /*
1930                  * One-off message (always uses msgid 0 to distinguish
1931                  * between a possibly lost in-transaction message due to
1932                  * competing aborts and a real one-off message?)
1933                  */
1934                 state = NULL;
1935                 msg->any.head.msgid = 0;
1936         }
1937
1938 #if 0
1939         /*
1940          * XXX removed - don't make this a panic, allow the state checks
1941          *     below to catch the situation.
1942          *
1943          * This flag is not set until after the tx thread has drained
1944          * the tx msgq and simulated responses.  After that point the
1945          * txthread is dead and can no longer simulate responses.
1946          *
1947          * Device drivers should never try to send a message once this
1948          * flag is set.  They should have detected (through the state
1949          * closures) that the link is in trouble.
1950          */
1951         if (iocom->flags & KDMSG_IOCOMF_EXITNOACC) {
1952                 lockmgr(&iocom->msglk, LK_RELEASE);
1953                 panic("kdmsg_msg_write: Attempt to write message to "
1954                       "terminated iocom\n");
1955         }
1956 #endif
1957
1958         /*
1959          * For stateful messages, if the circuit is dead or dying we have
1960          * to abort the potentially newly-created state and discard the
1961          * message.
1962          *
1963          * - We must discard the message because the other end will not
1964          *   be expecting any more messages over the dead or dying circuit
1965          *   and might not be able to receive them.
1966          *
1967          * - We abort the state by simulating a failure to generate a fake
1968          *   incoming DELETE.  This will trigger the state callback and allow
1969          *   the device to clean things up and reply, closing the outgoing
1970          *   direction and allowing the state to be freed.
1971          *
1972          * This situation occurs quite often, particularly as SPANs stabilize.
1973          * End-points must do the right thing.
1974          */
1975         if (state) {
1976                 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1977                 if (state->flags & KDMSG_STATE_DYING) {
1978 #if 0
1979                 if ((state->flags & KDMSG_STATE_DYING) ||
1980                     (state->parent->txcmd & DMSGF_DELETE) ||
1981                     (state->parent->flags & KDMSG_STATE_DYING)) {
1982 #endif
1983                         kdio_printf(iocom, 4,
1984                                     "kdmsg_msg_write: Write to dying circuit "
1985                                     "state=%p "
1986                                     "ptxcmd=%08x prxcmd=%08x flags=%08x\n",
1987                                     state,
1988                                     state->parent->rxcmd,
1989                                     state->parent->txcmd,
1990                                     state->parent->flags);
1991                         kdmsg_state_hold(state);
1992                         kdmsg_state_msgtx(msg);
1993                         kdmsg_state_cleanuptx(msg);
1994                         kdmsg_state_drop(state);
1995                         return;
1996                 }
1997         }
1998
1999         /*
2000          * Finish up the msg fields.  Note that msg->aux_size and the
2001          * aux_bytes stored in the message header represent the unaligned
2002          * (actual) bytes of data, but the buffer is sized to an aligned
2003          * size and the CRC is generated over the aligned length.
2004          */
2005         msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
2006         ++iocom->msg_seq;
2007
2008         if (msg->aux_data && msg->aux_size) {
2009                 uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
2010
2011                 msg->any.head.aux_bytes = msg->aux_size;
2012                 msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
2013         }
2014         msg->any.head.hdr_crc = 0;
2015         msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
2016
2017         TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
2018
2019         if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
2020                 atomic_clear_int(&iocom->msg_ctl,
2021                                  KDMSG_CLUSTERCTL_SLEEPING);
2022                 wakeup(&iocom->msg_ctl);
2023         }
2024 }
2025
2026 /*
2027  * Reply to a message and terminate our side of the transaction.
2028  *
2029  * If msg->state is non-NULL we are replying to a one-way message.
2030  */
2031 void
2032 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
2033 {
2034         kdmsg_state_t *state = msg->state;
2035         kdmsg_msg_t *nmsg;
2036         uint32_t cmd;
2037
2038         /*
2039          * Reply with a simple error code and terminate the transaction.
2040          */
2041         cmd = DMSG_LNK_ERROR;
2042
2043         /*
2044          * Check if our direction has even been initiated yet, set CREATE.
2045          *
2046          * Check what direction this is (command or reply direction).  Note
2047          * that txcmd might not have been initiated yet.
2048          *
2049          * If our direction has already been closed we just return without
2050          * doing anything.
2051          */
2052         if (state != &state->iocom->state0) {
2053                 if (state->txcmd & DMSGF_DELETE)
2054                         return;
2055                 if ((state->txcmd & DMSGF_CREATE) == 0)
2056                         cmd |= DMSGF_CREATE;
2057                 if (state->txcmd & DMSGF_REPLY)
2058                         cmd |= DMSGF_REPLY;
2059                 cmd |= DMSGF_DELETE;
2060         } else {
2061                 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2062                         cmd |= DMSGF_REPLY;
2063         }
2064
2065         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2066         nmsg->any.head.error = error;
2067         kdmsg_msg_write(nmsg);
2068 }
2069
2070 /*
2071  * Reply to a message and continue our side of the transaction.
2072  *
2073  * If msg->state is non-NULL we are replying to a one-way message and this
2074  * function degenerates into the same as kdmsg_msg_reply().
2075  */
2076 void
2077 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
2078 {
2079         kdmsg_state_t *state = msg->state;
2080         kdmsg_msg_t *nmsg;
2081         uint32_t cmd;
2082
2083         /*
2084          * Return a simple result code, do NOT terminate the transaction.
2085          */
2086         cmd = DMSG_LNK_ERROR;
2087
2088         /*
2089          * Check if our direction has even been initiated yet, set CREATE.
2090          *
2091          * Check what direction this is (command or reply direction).  Note
2092          * that txcmd might not have been initiated yet.
2093          *
2094          * If our direction has already been closed we just return without
2095          * doing anything.
2096          */
2097         if (state != &state->iocom->state0) {
2098                 if (state->txcmd & DMSGF_DELETE)
2099                         return;
2100                 if ((state->txcmd & DMSGF_CREATE) == 0)
2101                         cmd |= DMSGF_CREATE;
2102                 if (state->txcmd & DMSGF_REPLY)
2103                         cmd |= DMSGF_REPLY;
2104                 /* continuing transaction, do not set MSGF_DELETE */
2105         } else {
2106                 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2107                         cmd |= DMSGF_REPLY;
2108         }
2109
2110         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2111         nmsg->any.head.error = error;
2112         kdmsg_msg_write(nmsg);
2113 }
2114
2115 /*
2116  * Reply to a message and terminate our side of the transaction.
2117  *
2118  * If msg->state is non-NULL we are replying to a one-way message.
2119  */
2120 void
2121 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
2122 {
2123         kdmsg_msg_t *nmsg;
2124         uint32_t cmd;
2125
2126         /*
2127          * Reply with a simple error code and terminate the transaction.
2128          */
2129         cmd = DMSG_LNK_ERROR;
2130
2131         /*
2132          * Check if our direction has even been initiated yet, set CREATE.
2133          *
2134          * Check what direction this is (command or reply direction).  Note
2135          * that txcmd might not have been initiated yet.
2136          *
2137          * If our direction has already been closed we just return without
2138          * doing anything.
2139          */
2140         KKASSERT(state);
2141         if (state->txcmd & DMSGF_DELETE)
2142                 return;
2143         if ((state->txcmd & DMSGF_CREATE) == 0)
2144                 cmd |= DMSGF_CREATE;
2145         if (state->txcmd & DMSGF_REPLY)
2146                 cmd |= DMSGF_REPLY;
2147         cmd |= DMSGF_DELETE;
2148
2149         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2150         nmsg->any.head.error = error;
2151         kdmsg_msg_write(nmsg);
2152 }
2153
2154 /*
2155  * Reply to a message and continue our side of the transaction.
2156  *
2157  * If msg->state is non-NULL we are replying to a one-way message and this
2158  * function degenerates into the same as kdmsg_msg_reply().
2159  */
2160 void
2161 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
2162 {
2163         kdmsg_msg_t *nmsg;
2164         uint32_t cmd;
2165
2166         /*
2167          * Return a simple result code, do NOT terminate the transaction.
2168          */
2169         cmd = DMSG_LNK_ERROR;
2170
2171         /*
2172          * Check if our direction has even been initiated yet, set CREATE.
2173          *
2174          * Check what direction this is (command or reply direction).  Note
2175          * that txcmd might not have been initiated yet.
2176          *
2177          * If our direction has already been closed we just return without
2178          * doing anything.
2179          */
2180         KKASSERT(state);
2181         if (state->txcmd & DMSGF_DELETE)
2182                 return;
2183         if ((state->txcmd & DMSGF_CREATE) == 0)
2184                 cmd |= DMSGF_CREATE;
2185         if (state->txcmd & DMSGF_REPLY)
2186                 cmd |= DMSGF_REPLY;
2187         /* continuing transaction, do not set MSGF_DELETE */
2188
2189         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2190         nmsg->any.head.error = error;
2191         kdmsg_msg_write(nmsg);
2192 }