kernel - Cleanup gcc warnings
[dragonfly.git] / sys / kern / kern_dmsg.c
1 /*-
2  * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * TODO: txcmd CREATE state is deferred by tx msgq, need to calculate
36  *       a streaming response.  See subr_diskiocom()'s diskiodone().
37  */
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/sysctl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/priv.h>
53 #include <sys/thread.h>
54 #include <sys/globaldata.h>
55 #include <sys/limits.h>
56
57 #include <sys/dmsg.h>
58
/*
 * Instantiate the red-black tree support code for kdmsg_state_tree.
 * Nodes are kdmsg_state structures linked through their rbnode field
 * and ordered by kdmsg_state_cmp().
 */
RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);

/*
 * Top-level "kdmsg" sysctl node and its read/write "debug" integer.
 * kdmsg_debug is the threshold tested by the kd_printf()/kdio_printf()
 * macros in this file; it starts out at 1.
 */
SYSCTL_NODE(, OID_AUTO, kdmsg, CTLFLAG_RW, 0, "kdmsg");
static int kdmsg_debug = 1;
SYSCTL_INT(_kdmsg, OID_AUTO, debug, CTLFLAG_RW, &kdmsg_debug, 0,
           "Set debug level for kernel dmsg layer");
65
/*
 * Debug output helpers.  The message is passed to kprintf(), prefixed
 * with "kdmsg: ", only when kdmsg_debug is at least 'level'.  'ctl' must
 * be a string literal (it is concatenated with the prefix) and at least
 * one argument must follow it.
 *
 * Each macro is wrapped in do { } while (0) so that it expands to exactly
 * one statement.  A bare "if (...) kprintf(...)" expansion would capture
 * the caller's 'else' when used as the body of an unbraced if/else.
 *
 * kdio_printf() additionally accepts the iocom the message relates to;
 * the argument is currently not used in the output.
 */
#define kd_printf(level, ctl, ...)                              \
        do {                                                    \
                if (kdmsg_debug >= (level))                     \
                        kprintf("kdmsg: " ctl, __VA_ARGS__);    \
        } while (0)

#define kdio_printf(iocom, level, ctl, ...)                     \
        do {                                                    \
                if (kdmsg_debug >= (level))                     \
                        kprintf("kdmsg: " ctl, __VA_ARGS__);    \
        } while (0)
71
/*
 * Forward declarations for file-local helpers.
 */
static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
static void kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg);
static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
static void kdmsg_subq_delete(kdmsg_state_t *state);
static void kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error);
static void kdmsg_state_abort(kdmsg_state_t *state);
static void kdmsg_state_dying(kdmsg_state_t *state);
static void kdmsg_state_free(kdmsg_state_t *state);

/*
 * State hold/drop wrappers.  When KDMSG_DEBUG is defined the wrappers
 * pass the caller's __FILE__ and __LINE__ to the underlying functions
 * and KDMSG_DEBUG_ARGS adds the matching parameters to their signatures.
 * Otherwise only the state pointer is passed.
 */
#ifdef KDMSG_DEBUG
#define KDMSG_DEBUG_ARGS        , const char *file, int line
#define kdmsg_state_hold(state) _kdmsg_state_hold(state, __FILE__, __LINE__)
#define kdmsg_state_drop(state) _kdmsg_state_drop(state, __FILE__, __LINE__)
#else
#define KDMSG_DEBUG_ARGS
#define kdmsg_state_hold(state) _kdmsg_state_hold(state)
#define kdmsg_state_drop(state) _kdmsg_state_drop(state)
#endif
static void _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
static void _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS);

/*
 * Reader/writer thread entry points (started from
 * kdmsg_iocom_reconnect()) and the automatic receive handler used when
 * KDMSG_IOCOMF_AUTOANY is set.
 */
static void kdmsg_iocom_thread_rd(void *arg);
static void kdmsg_iocom_thread_wr(void *arg);
static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
99
100 /*static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);*/
101
102 /*
103  * Initialize the roll-up communications structure for a network
104  * messaging session.  This function does not install the socket.
105  */
106 void
107 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
108                  struct malloc_type *mmsg,
109                  int (*rcvmsg)(kdmsg_msg_t *msg))
110 {
111         bzero(iocom, sizeof(*iocom));
112         iocom->handle = handle;
113         iocom->mmsg = mmsg;
114         iocom->rcvmsg = rcvmsg;
115         iocom->flags = flags;
116         lockinit(&iocom->msglk, "h2msg", 0, 0);
117         TAILQ_INIT(&iocom->msgq);
118         RB_INIT(&iocom->staterd_tree);
119         RB_INIT(&iocom->statewr_tree);
120
121         iocom->state0.iocom = iocom;
122         iocom->state0.parent = &iocom->state0;
123         TAILQ_INIT(&iocom->state0.subq);
124 }
125
126 /*
127  * [Re]connect using the passed file pointer.  The caller must ref the
128  * fp for us.  We own that ref now.
129  */
130 void
131 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
132                       const char *subsysname)
133 {
134         /*
135          * Destroy the current connection
136          */
137         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
138         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
139         while (iocom->msgrd_td || iocom->msgwr_td) {
140                 wakeup(&iocom->msg_ctl);
141                 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
142         }
143
144         /*
145          * Drop communications descriptor
146          */
147         if (iocom->msg_fp) {
148                 fdrop(iocom->msg_fp);
149                 iocom->msg_fp = NULL;
150         }
151
152         /*
153          * Setup new communications descriptor
154          */
155         iocom->msg_ctl = 0;
156         iocom->msg_fp = fp;
157         iocom->msg_seq = 0;
158         iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
159
160         lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
161                     NULL, 0, -1, "%s-msgrd", subsysname);
162         lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
163                     NULL, 0, -1, "%s-msgwr", subsysname);
164         lockmgr(&iocom->msglk, LK_RELEASE);
165 }
166
167 /*
168  * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
169  * this function to handle the state machine for LNK_CONN and LNK_SPAN.
170  */
171 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
172 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
173
174 void
175 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
176                          void (*auto_callback)(kdmsg_msg_t *msg))
177 {
178         kdmsg_msg_t *msg;
179
180         iocom->auto_callback = auto_callback;
181
182         msg = kdmsg_msg_alloc(&iocom->state0,
183                               DMSG_LNK_CONN | DMSGF_CREATE,
184                               kdmsg_lnk_conn_reply, NULL);
185         iocom->auto_lnk_conn.head = msg->any.head;
186         msg->any.lnk_conn = iocom->auto_lnk_conn;
187         iocom->conn_state = msg->state;
188         kdmsg_state_hold(msg->state);   /* iocom->conn_state */
189         kdmsg_msg_write(msg);
190 }
191
192 static
193 int
194 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
195 {
196         kdmsg_iocom_t *iocom = state->iocom;
197         kdmsg_msg_t *rmsg;
198
199         /*
200          * Upon receipt of the LNK_CONN acknowledgement initiate an
201          * automatic SPAN if we were asked to.  Used by e.g. xdisk, but
202          * not used by HAMMER2 which must manage more than one transmitted
203          * SPAN.
204          */
205         if ((msg->any.head.cmd & DMSGF_CREATE) &&
206             (iocom->flags & KDMSG_IOCOMF_AUTOTXSPAN)) {
207                 rmsg = kdmsg_msg_alloc(&iocom->state0,
208                                        DMSG_LNK_SPAN | DMSGF_CREATE,
209                                        kdmsg_lnk_span_reply, NULL);
210                 iocom->auto_lnk_span.head = rmsg->any.head;
211                 rmsg->any.lnk_span = iocom->auto_lnk_span;
212                 kdmsg_msg_write(rmsg);
213         }
214
215         /*
216          * Process shim after the CONN is acknowledged and before the CONN
217          * transaction is deleted.  For deletions this gives device drivers
218          * the ability to interlock new operations on the circuit before
219          * it becomes illegal and panics.
220          */
221         if (iocom->auto_callback)
222                 iocom->auto_callback(msg);
223
224         if ((state->txcmd & DMSGF_DELETE) == 0 &&
225             (msg->any.head.cmd & DMSGF_DELETE)) {
226                 /*
227                  * iocom->conn_state has a state ref, drop it when clearing.
228                  */
229                 if (iocom->conn_state)
230                         kdmsg_state_drop(iocom->conn_state);
231                 iocom->conn_state = NULL;
232                 kdmsg_msg_reply(msg, 0);
233         }
234
235         return (0);
236 }
237
238 static
239 int
240 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
241 {
242         /*
243          * Be sure to process shim before terminating the SPAN
244          * transaction.  Gives device drivers the ability to
245          * interlock new operations on the circuit before it
246          * becomes illegal and panics.
247          */
248         if (state->iocom->auto_callback)
249                 state->iocom->auto_callback(msg);
250
251         if ((state->txcmd & DMSGF_DELETE) == 0 &&
252             (msg->any.head.cmd & DMSGF_DELETE)) {
253                 kdmsg_msg_reply(msg, 0);
254         }
255         return (0);
256 }
257
258 /*
259  * Disconnect and clean up
260  */
261 void
262 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
263 {
264         kdmsg_state_t *state;
265         kdmsg_msg_t *msg;
266         int retries;
267
268         /*
269          * Ask the cluster controller to go away by setting
270          * KILLRX.  Send a PING to get a response to unstick reading
271          * from the pipe.
272          *
273          * After 10 seconds shitcan the pipe and do an unclean shutdown.
274          */
275         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
276
277         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
278         msg = kdmsg_msg_alloc(&iocom->state0, DMSG_LNK_PING, NULL, NULL);
279         kdmsg_msg_write_locked(iocom, msg);
280
281         retries = 10;
282         while (iocom->msgrd_td || iocom->msgwr_td) {
283                 wakeup(&iocom->msg_ctl);
284                 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
285                 if (--retries == 0 && iocom->msg_fp) {
286                         kdio_printf(iocom, 0, "%s\n",
287                                     "iocom_uninit: "
288                                     "shitcanning unresponsive pipe");
289                         fp_shutdown(iocom->msg_fp, SHUT_RDWR);
290                         /* retries allowed to go negative, keep looping */
291                 }
292         }
293
294         /*
295          * Cleanup caches
296          */
297         if ((state = iocom->freerd_state) != NULL) {
298                 iocom->freerd_state = NULL;
299                 kdmsg_state_drop(state);
300         }
301
302         if ((state = iocom->freewr_state) != NULL) {
303                 iocom->freewr_state = NULL;
304                 kdmsg_state_drop(state);
305         }
306
307         /*
308          * Drop communications descriptor
309          */
310         if (iocom->msg_fp) {
311                 fdrop(iocom->msg_fp);
312                 iocom->msg_fp = NULL;
313         }
314         lockmgr(&iocom->msglk, LK_RELEASE);
315 }
316
317 /*
318  * Cluster controller thread.  Perform messaging functions.  We have one
319  * thread for the reader and one for the writer.  The writer handles
320  * shutdown requests (which should break the reader thread).
321  */
322 static
323 void
324 kdmsg_iocom_thread_rd(void *arg)
325 {
326         kdmsg_iocom_t *iocom = arg;
327         dmsg_hdr_t hdr;
328         kdmsg_msg_t *msg = NULL;
329         size_t hbytes;
330         size_t abytes;
331         int error = 0;
332
333         while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLRX) == 0) {
334                 /*
335                  * Retrieve the message from the pipe or socket.
336                  */
337                 error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
338                                 NULL, 1, UIO_SYSSPACE);
339                 if (error)
340                         break;
341                 if (hdr.magic != DMSG_HDR_MAGIC) {
342                         kdio_printf(iocom, 1, "bad magic: %04x\n", hdr.magic);
343                         error = EINVAL;
344                         break;
345                 }
346                 hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
347                 if (hbytes < sizeof(hdr) || hbytes > DMSG_HDR_MAX) {
348                         kdio_printf(iocom, 1, "bad header size %zd\n", hbytes);
349                         error = EINVAL;
350                         break;
351                 }
352
353                 /* XXX messy: mask cmd to avoid allocating state */
354                 msg = kdmsg_msg_alloc(&iocom->state0,
355                                       hdr.cmd & DMSGF_BASECMDMASK,
356                                       NULL, NULL);
357                 msg->any.head = hdr;
358                 msg->hdr_size = hbytes;
359                 if (hbytes > sizeof(hdr)) {
360                         error = fp_read(iocom->msg_fp, &msg->any.head + 1,
361                                         hbytes - sizeof(hdr),
362                                         NULL, 1, UIO_SYSSPACE);
363                         if (error) {
364                                 kdio_printf(iocom, 1, "%s\n",
365                                             "short msg received");
366                                 error = EINVAL;
367                                 break;
368                         }
369                 }
370                 msg->aux_size = hdr.aux_bytes;
371                 if (msg->aux_size > DMSG_AUX_MAX) {
372                         kdio_printf(iocom, 1,
373                                     "illegal msg payload size %zd\n",
374                                     msg->aux_size);
375                         error = EINVAL;
376                         break;
377                 }
378                 if (msg->aux_size) {
379                         abytes = DMSG_DOALIGN(msg->aux_size);
380                         msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
381                         msg->flags |= KDMSG_FLAG_AUXALLOC;
382                         error = fp_read(iocom->msg_fp, msg->aux_data,
383                                         abytes, NULL, 1, UIO_SYSSPACE);
384                         if (error) {
385                                 kdio_printf(iocom, 1, "%s\n",
386                                             "short msg payload received");
387                                 break;
388                         }
389                 }
390
391                 error = kdmsg_msg_receive_handling(msg);
392                 msg = NULL;
393         }
394
395         kdio_printf(iocom, 1, "read thread terminating error=%d\n", error);
396
397         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
398         if (msg)
399                 kdmsg_msg_free(msg);
400
401         /*
402          * Shutdown the socket and set KILLRX for consistency in case the
403          * shutdown was not commanded.  Signal the transmit side to shutdown
404          * by setting KILLTX and waking it up.
405          */
406         fp_shutdown(iocom->msg_fp, SHUT_RDWR);
407         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
408                                         KDMSG_CLUSTERCTL_KILLTX);
409         iocom->msgrd_td = NULL;
410         lockmgr(&iocom->msglk, LK_RELEASE);
411         wakeup(&iocom->msg_ctl);
412
413         /*
414          * iocom can be ripped out at any time once the lock is
415          * released with msgrd_td set to NULL.  The wakeup()s are safe but
416          * that is all.
417          */
418         wakeup(iocom);
419         lwkt_exit();
420 }
421
/*
 * Writer thread.  Transmits messages queued on iocom->msgq to
 * iocom->msg_fp until KILLTX is set or an error occurs, then performs
 * the final teardown of the connection: it waits for the reader to exit,
 * drains the transmit queue, simulates failure on all remaining states
 * and finally calls iocom->exit_func (if set) or wakes up waiters.
 *
 * arg is the kdmsg_iocom_t this thread serves.  The function does not
 * return; it ends in lwkt_exit().
 */
static
void
kdmsg_iocom_thread_wr(void *arg)
{
        kdmsg_iocom_t *iocom = arg;
        kdmsg_msg_t *msg;
        ssize_t res;
        size_t abytes;
        int error = 0;
        int save_ticks;
        int didwarn;

        /*
         * Transmit loop.  msglk is held except around the actual
         * fp_write() calls.
         */
        msg = NULL;
        lockmgr(&iocom->msglk, LK_EXCLUSIVE);

        while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0 && error == 0) {
                /*
                 * Sleep if no messages pending.  Interlock with flag while
                 * holding msglk.  The sleep is bounded by hz ticks so the
                 * KILLTX condition is re-evaluated periodically.
                 */
                if (TAILQ_EMPTY(&iocom->msgq)) {
                        atomic_set_int(&iocom->msg_ctl,
                                       KDMSG_CLUSTERCTL_SLEEPING);
                        lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
                        atomic_clear_int(&iocom->msg_ctl,
                                         KDMSG_CLUSTERCTL_SLEEPING);
                }

                while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
                        /*
                         * Remove msg from the transmit queue and do
                         * persist and half-closed state handling.
                         *
                         * EALREADY means the message is discarded
                         * without being transmitted and is not treated
                         * as an error.  Any other error frees the
                         * message and terminates the transmit loop.
                         */
                        TAILQ_REMOVE(&iocom->msgq, msg, qentry);

                        error = kdmsg_state_msgtx(msg);
                        if (error == EALREADY) {
                                error = 0;
                                kdmsg_msg_free(msg);
                                continue;
                        }
                        if (error) {
                                kdmsg_msg_free(msg);
                                break;
                        }

                        /*
                         * Dump the message to the pipe or socket.
                         *
                         * We have to clean up the message as if the transmit
                         * succeeded even if it failed.
                         *
                         * msglk is released for the write and re-acquired
                         * on every exit path before cleanuptx is called.
                         * A short write is converted to EINVAL.
                         */
                        lockmgr(&iocom->msglk, LK_RELEASE);
                        error = fp_write(iocom->msg_fp, &msg->any,
                                         msg->hdr_size, &res, UIO_SYSSPACE);
                        if (error || res != msg->hdr_size) {
                                if (error == 0)
                                        error = EINVAL;
                                lockmgr(&iocom->msglk, LK_EXCLUSIVE);
                                kdmsg_state_cleanuptx(msg);
                                break;
                        }
                        if (msg->aux_size) {
                                /* The payload is written at its aligned size */
                                abytes = DMSG_DOALIGN(msg->aux_size);
                                error = fp_write(iocom->msg_fp,
                                                 msg->aux_data, abytes,
                                                 &res, UIO_SYSSPACE);
                                if (error || res != abytes) {
                                        if (error == 0)
                                                error = EINVAL;
                                        lockmgr(&iocom->msglk, LK_EXCLUSIVE);
                                        kdmsg_state_cleanuptx(msg);
                                        break;
                                }
                        }
                        lockmgr(&iocom->msglk, LK_EXCLUSIVE);
                        kdmsg_state_cleanuptx(msg);
                }
        }

        kdio_printf(iocom, 1, "write thread terminating error=%d\n", error);

        /*
         * Shutdown the socket and set KILLTX for consistency in case the
         * shutdown was not commanded.  Signal the receive side to shutdown
         * by setting KILLRX and waking it up.
         */
        fp_shutdown(iocom->msg_fp, SHUT_RDWR);
        atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
                                        KDMSG_CLUSTERCTL_KILLTX);
        wakeup(&iocom->msg_ctl);

        /*
         * The transmit thread is responsible for final cleanups, wait
         * for the receive side to terminate to prevent new received
         * states from interfering with our cleanup.
         *
         * Do not set msgwr_td to NULL until we actually exit.
         */
        while (iocom->msgrd_td) {
                wakeup(&iocom->msg_ctl);
                lksleep(iocom, &iocom->msglk, 0, "clstrkt", hz);
        }

        /*
         * We can no longer receive new messages.  We must drain the transmit
         * message queue and simulate received messages to close any remaining
         * states.
         *
         * Loop until all the states are gone and there are no messages
         * pending transmit.
         */
        save_ticks = ticks;
        didwarn = 0;

        while (TAILQ_FIRST(&iocom->msgq) ||
               RB_ROOT(&iocom->staterd_tree) ||
               RB_ROOT(&iocom->statewr_tree)) {
                /*
                 * Simulate failure for all sub-states of state0.
                 */
                kdmsg_drain_msgq(iocom);
                kdio_printf(iocom, 2, "%s\n",
                            "simulate failure for all substates of state0");
                kdmsg_simulate_failure(&iocom->state0, 0, DMSG_ERR_LOSTLINK);

                lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz / 2);

                /*
                 * Progress reporting.  One warning is printed once more
                 * than 2*hz ticks have elapsed and a second one after
                 * 15*hz ticks (didwarn tracks which have been issued).
                 * Past 60*hz ticks the remaining queue/tree heads are
                 * printed on every pass and the loop backs off with an
                 * additional sleep of up to 10*hz ticks.
                 */
                if ((int)(ticks - save_ticks) > hz*2 && didwarn == 0) {
                        didwarn = 1;
                        kdio_printf(iocom, 0,
                                    "Warning, write thread on %p "
                                    "still terminating\n",
                                    iocom);
                }
                if ((int)(ticks - save_ticks) > hz*15 && didwarn == 1) {
                        didwarn = 2;
                        kdio_printf(iocom, 0,
                                    "Warning, write thread on %p "
                                    "still terminating\n",
                                    iocom);
                }
                if ((int)(ticks - save_ticks) > hz*60) {
                        kdio_printf(iocom, 0,
                                    "Can't terminate: msgq %p "
                                    "rd_tree %p wr_tree %p\n",
                                    TAILQ_FIRST(&iocom->msgq),
                                    RB_ROOT(&iocom->staterd_tree),
                                    RB_ROOT(&iocom->statewr_tree));
                        lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz * 10);
                }
        }

        /*
         * Exit handling is done by the write thread.
         */
        iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
        lockmgr(&iocom->msglk, LK_RELEASE);

        /*
         * The state trees had better be empty now
         */
        KKASSERT(RB_EMPTY(&iocom->staterd_tree));
        KKASSERT(RB_EMPTY(&iocom->statewr_tree));
        KKASSERT(iocom->conn_state == NULL);

        if (iocom->exit_func) {
                /*
                 * iocom is invalid after we call the exit function.
                 */
                iocom->msgwr_td = NULL;
                iocom->exit_func(iocom);
        } else {
                /*
                 * iocom can be ripped out from under us once msgwr_td is
                 * set to NULL.  The wakeup is safe.
                 */
                iocom->msgwr_td = NULL;
                wakeup(iocom);
        }
        lwkt_exit();
}
607
608 /*
609  * This cleans out the pending transmit message queue, adjusting any
610  * persistent states properly in the process.
611  *
612  * Called with iocom locked.
613  */
614 void
615 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
616 {
617         kdmsg_msg_t *msg;
618
619         /*
620          * Clean out our pending transmit queue, executing the
621          * appropriate state adjustments.  If this tries to open
622          * any new outgoing transactions we have to loop up and
623          * clean them out.
624          */
625         while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
626                 TAILQ_REMOVE(&iocom->msgq, msg, qentry);
627                 if (kdmsg_state_msgtx(msg))
628                         kdmsg_msg_free(msg);
629                 else
630                         kdmsg_state_cleanuptx(msg);
631         }
632 }
633
634 /*
635  * Do all processing required to handle a freshly received message
636  * after its low level header has been validated.
637  *
638  * iocom is not locked.
639  */
640 static
641 int
642 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
643 {
644         kdmsg_iocom_t *iocom = msg->state->iocom;
645         int error;
646
647         /*
648          * State machine tracking, state assignment for msg,
649          * returns error and discard status.  Errors are fatal
650          * to the connection except for EALREADY which forces
651          * a discard without execution.
652          */
653         error = kdmsg_state_msgrx(msg);
654         if (msg->state->flags & KDMSG_STATE_ABORTING) {
655                 kdio_printf(iocom, 5,
656                             "kdmsg_state_abort(b): state %p rxcmd=%08x "
657                             "txcmd=%08x msgrx error %d\n",
658                             msg->state, msg->state->rxcmd,
659                             msg->state->txcmd, error);
660         }
661         if (error) {
662                 /*
663                  * Raw protocol or connection error
664                  */
665                 if (msg->state->flags & KDMSG_STATE_ABORTING)
666                         kdio_printf(iocom, 5,
667                                     "X1 state %p error %d\n",
668                                     msg->state, error);
669                 kdmsg_msg_free(msg);
670                 if (error == EALREADY)
671                         error = 0;
672         } else if (msg->state && msg->state->func) {
673                 /*
674                  * Message related to state which already has a
675                  * handling function installed for it.
676                  */
677                 if (msg->state->flags & KDMSG_STATE_ABORTING)
678                         kdio_printf(iocom, 5,
679                                     "X2 state %p func %p\n",
680                                     msg->state, msg->state->func);
681                 error = msg->state->func(msg->state, msg);
682                 kdmsg_state_cleanuprx(msg);
683         } else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
684                 if (msg->state->flags & KDMSG_STATE_ABORTING)
685                         kdio_printf(iocom, 5,
686                                     "X3 state %p\n", msg->state);
687                 error = kdmsg_autorxmsg(msg);
688                 kdmsg_state_cleanuprx(msg);
689         } else {
690                 if (msg->state->flags & KDMSG_STATE_ABORTING)
691                         kdio_printf(iocom, 5,
692                                     "X4 state %p\n", msg->state);
693                 error = iocom->rcvmsg(msg);
694                 kdmsg_state_cleanuprx(msg);
695         }
696         return error;
697 }
698
699 /*
700  * Process state tracking for a message after reception and dequeueing,
701  * prior to execution of the state callback.  The state is updated and
702  * will be removed from the RBTREE if completely closed, but the state->parent
703  * and subq linkage is not cleaned up until after the callback (see
704  * cleanuprx()).
705  *
706  * msglk is not held.
707  *
708  * NOTE: A message transaction can consist of several messages in either
709  *       direction.
710  *
711  * NOTE: The msgid is unique to the initiator, not necessarily unique for
712  *       us or for any relay or for the return direction for that matter.
713  *       That is, two sides sending a new message can use the same msgid
714  *       without colliding.
715  *
716  * --
717  *
718  * ABORT sequences work by setting the ABORT flag along with normal message
719  * state.  However, ABORTs can also be sent on half-closed messages, that is
720  * even if the command or reply side has already sent a DELETE, as long as
721  * the message has not been fully closed it can still send an ABORT+DELETE
722  * to terminate the half-closed message state.
723  *
724  * Since ABORT+DELETEs can race we silently discard ABORT's for message
725  * state which has already been fully closed.  REPLY+ABORT+DELETEs can
726  * also race, and in this situation the other side might have already
727  * initiated a new unrelated command with the same message id.  Since
728  * the abort has not set the CREATE flag the situation can be detected
729   * and the message will also be discarded.
730  *
731  * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
732  * The ABORT request is essentially integrated into the command instead
733  * of being sent later on.  In this situation the command implementation
734  * detects that CREATE and ABORT are both set (vs ABORT alone) and can
735  * special-case non-blocking operation for the command.
736  *
737  * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
738  *        to be mid-stream aborts for command/reply sequences.  ABORTs on
739  *        one-way messages are not supported.
740  *
741  * NOTE!  If a command sequence does not support aborts the ABORT flag is
742  *        simply ignored.
743  *
744  * --
745  *
 * One-off messages (no reply expected) are sent with neither CREATE nor DELETE
747  * set.  One-off messages cannot be aborted and typically aren't processed
748  * by these routines.  The REPLY bit can be used to distinguish whether a
749  * one-off message is a command or reply.  For example, one-off replies
750  * will typically just contain status updates.
751  */
/*
 * kdmsg_state_msgrx() - receive-side transaction state tracking for msg.
 *
 * Takes iocom->msglk exclusively on entry and releases it before returning.
 * When msg arrives on the root state (iocom->state0) the persistent state
 * is looked up by msgid and installed in msg->state.  A command with CREATE
 * set consumes iocom->freerd_state as its new state and inserts it into
 * staterd_tree and the parent's subq.  msg->tcmd is set on every path.
 *
 * Returns 0 on success, EINVAL when the message does not fit the tracked
 * state (duplicate CREATE, missing parent, missing or reused state), or
 * EALREADY when an ABORT arrives for state which is gone or was reused.
 */
752 static
753 int
754 kdmsg_state_msgrx(kdmsg_msg_t *msg)
755 {
756         kdmsg_iocom_t *iocom = msg->state->iocom;
757         kdmsg_state_t *state;
758         kdmsg_state_t *pstate;
759         kdmsg_state_t sdummy;
760         int error;
761
762         bzero(&sdummy, sizeof(sdummy)); /* avoid gcc warnings */
763
764         /*
765          * Make sure a state structure is ready to go in case we need a new
766          * one.  This is the only routine which uses freerd_state so no
767          * races are possible.
768          */
769         if ((state = iocom->freerd_state) == NULL) {
770                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
771                 state->flags = KDMSG_STATE_DYNAMIC;
772                 state->iocom = iocom;
773                 state->refs = 1;
774                 TAILQ_INIT(&state->subq);
775                 iocom->freerd_state = state;
776         }
777         state = NULL;   /* safety */
778
779         /*
780          * Lock RB tree and locate existing persistent state, if any.
781          *
782          * If received msg is a command state is on staterd_tree.
783          * If received msg is a reply state is on statewr_tree.
784          */
785         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
786
787 again:
788         if (msg->state == &iocom->state0) {
789                 sdummy.msgid = msg->any.head.msgid;
790                 sdummy.iocom = iocom;
791                 if (msg->any.head.cmd & DMSGF_REVTRANS) {
792                         state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree,
793                                         &sdummy);
794                 } else {
795                         state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree,
796                                         &sdummy);
797                 }
798
799                 /*
800                  * Set message state unconditionally.  If this is a CREATE
801                  * message this state will become the parent state and new
802                  * state will be allocated for the message state.
803                  */
804                 if (state == NULL)
805                         state = &iocom->state0;
                    /*
                     * The state is flagged INTERLOCK.  Set SIGNAL (presumably
                     * so whoever holds the interlock issues a wakeup; confirm
                     * against the code that sets INTERLOCK), sleep on the
                     * state with a timeout of hz, then redo the lookup from
                     * scratch since the trees may have changed.
                     */
806                 if (state->flags & KDMSG_STATE_INTERLOCK) {
807                         state->flags |= KDMSG_STATE_SIGNAL;
808                         lksleep(state, &iocom->msglk, 0, "dmrace", hz);
809                         goto again;
810                 }
811                 kdmsg_state_hold(state);
812                 kdmsg_state_drop(msg->state);   /* iocom->state0 */
813                 msg->state = state;
814         } else {
815                 state = msg->state;
816         }
817
818         /*
819          * Short-cut one-off or mid-stream messages.
820          */
821         if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
822                                   DMSGF_ABORT)) == 0) {
823                 error = 0;
824                 goto done;
825         }
826
827         /*
828          * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
829          * inside the case statements.
830          */
831         switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
832         case DMSGF_CREATE:
833         case DMSGF_CREATE | DMSGF_DELETE:
834                 /*
835                  * New persistent command received.
836                  */
837                 if (state != &iocom->state0) {
838                         kdio_printf(iocom, 1, "%s\n",
839                                     "duplicate transaction");
840                         error = EINVAL;
841                         break;
842                 }
843
844                 /*
845                  * Lookup the circuit.  The circuit is an open transaction.
846                  * the REVCIRC bit in the message tells us which side
847                  * initiated the transaction representing the circuit.
848                  */
849                 if (msg->any.head.circuit) {
850                         sdummy.msgid = msg->any.head.circuit;
851
852                         if (msg->any.head.cmd & DMSGF_REVCIRC) {
853                                 pstate = RB_FIND(kdmsg_state_tree,
854                                                  &iocom->statewr_tree,
855                                                  &sdummy);
856                         } else {
857                                 pstate = RB_FIND(kdmsg_state_tree,
858                                                  &iocom->staterd_tree,
859                                                  &sdummy);
860                         }
861                         if (pstate == NULL) {
862                                 kdio_printf(iocom, 1, "%s\n",
863                                             "missing parent in "
864                                             "stacked trans");
865                                 error = EINVAL;
866                                 break;
867                         }
868                 } else {
869                         pstate = &iocom->state0;
870                 }
871
872                 /*
873                  * Allocate new state.
874                  *
875                  * msg->state becomes the owner of the ref we inherit from
876                  * freerd_state.
877                  */
878                 kdmsg_state_drop(state);
879                 state = iocom->freerd_state;
880                 iocom->freerd_state = NULL;
881
882                 msg->state = state;             /* inherits freerd ref */
883                 state->parent = pstate;
884                 KKASSERT(state->iocom == iocom);
885                 state->flags |= KDMSG_STATE_RBINSERTED |
886                                 KDMSG_STATE_SUBINSERTED |
887                                 KDMSG_STATE_OPPOSITE;
888                 if (TAILQ_EMPTY(&pstate->subq))
889                         kdmsg_state_hold(pstate);/* states on pstate->subq */
890                 kdmsg_state_hold(state);        /* state on pstate->subq */
891                 kdmsg_state_hold(state);        /* state on rbtree */
892                 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
893                 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
894                 state->txcmd = DMSGF_REPLY;
895                 state->msgid = msg->any.head.msgid;
896                 state->flags &= ~KDMSG_STATE_NEW;
897                 RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
898                 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
899                 error = 0;
900                 break;
901         case DMSGF_DELETE:
902                 /*
903                  * Persistent state is expected but might not exist if an
904                  * ABORT+DELETE races the close.
905                  */
906                 if (state == &iocom->state0) {
907                         if (msg->any.head.cmd & DMSGF_ABORT) {
908                                 kdio_printf(iocom, 1, "%s\n",
909                                             "msgrx: "
910                                             "state already A");
911                                 error = EALREADY;
912                         } else {
913                                 kdio_printf(iocom, 1, "%s\n",
914                                             "msgrx: no state for DELETE");
915                                 error = EINVAL;
916                         }
917                         break;
918                 }
919
920                 /*
921                  * Handle another ABORT+DELETE case if the msgid has already
922                  * been reused.
923                  */
924                 if ((state->rxcmd & DMSGF_CREATE) == 0) {
925                         if (msg->any.head.cmd & DMSGF_ABORT) {
926                                 kdio_printf(iocom, 1, "%s\n",
927                                             "msgrx: state already B");
928                                 error = EALREADY;
929                         } else {
930                                 kdio_printf(iocom, 1, "%s\n",
931                                             "msgrx: state reused for DELETE");
932                                 error = EINVAL;
933                         }
934                         break;
935                 }
936                 error = 0;
937                 break;
938         default:
939                 /*
940                  * Check for mid-stream ABORT command received, otherwise
941                  * allow.
                     *
                     * This is the only remaining combination: none of CREATE,
                     * DELETE or REPLY is set.  Since the short-cut above
                     * already left for messages lacking all of CREATE, DELETE
                     * and ABORT, ABORT is in fact always set when we get here.
942                  */
943                 if (msg->any.head.cmd & DMSGF_ABORT) {
944                         if (state == &iocom->state0 ||
945                             (state->rxcmd & DMSGF_CREATE) == 0) {
946                                 error = EALREADY;
947                                 break;
948                         }
949                 }
950                 error = 0;
951                 break;
952         case DMSGF_REPLY | DMSGF_CREATE:
953         case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
954                 /*
955                  * When receiving a reply with CREATE set the original
956                  * persistent state message should already exist.
957                  */
958                 if (state == &iocom->state0) {
959                         kdio_printf(iocom, 1,
960                                     "msgrx: no state match for "
961                                     "REPLY cmd=%08x msgid=%016jx\n",
962                                     msg->any.head.cmd,
963                                     (intmax_t)msg->any.head.msgid);
964                         error = EINVAL;
965                         break;
966                 }
967                 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
968                 error = 0;
969                 break;
970         case DMSGF_REPLY | DMSGF_DELETE:
971                 /*
972                  * Received REPLY+ABORT+DELETE in case where msgid has
973                  * already been fully closed, ignore the message.
974                  */
975                 if (state == &iocom->state0) {
976                         if (msg->any.head.cmd & DMSGF_ABORT) {
977                                 error = EALREADY;
978                         } else {
979                                 kdio_printf(iocom, 1, "%s\n",
980                                             "msgrx: no state match "
981                                             "for REPLY|DELETE");
982                                 error = EINVAL;
983                         }
984                         break;
985                 }
986
987                 /*
988                  * Received REPLY+ABORT+DELETE in case where msgid has
989                  * already been reused for an unrelated message,
990                  * ignore the message.
991                  */
992                 if ((state->rxcmd & DMSGF_CREATE) == 0) {
993                         if (msg->any.head.cmd & DMSGF_ABORT) {
994                                 error = EALREADY;
995                         } else {
996                                 kdio_printf(iocom, 1, "%s\n",
997                                             "msgrx: state reused "
998                                             "for REPLY|DELETE");
999                                 error = EINVAL;
1000                         }
1001                         break;
1002                 }
1003                 error = 0;
1004                 break;
1005         case DMSGF_REPLY:
1006                 /*
1007                  * Check for mid-stream ABORT reply received to sent command.
1008                  */
1009                 if (msg->any.head.cmd & DMSGF_ABORT) {
1010                         if (state == &iocom->state0 ||
1011                             (state->rxcmd & DMSGF_CREATE) == 0) {
1012                                 error = EALREADY;
1013                                 break;
1014                         }
1015                 }
1016                 error = 0;
1017                 break;
1018         }
1019
1020         /*
1021          * Calculate the easy-switch() transactional command.  Represents
1022          * the outer-transaction command for any transaction-create or
1023          * transaction-delete, and the inner message command for any
1024          * non-transaction or inside-transaction command.  tcmd will be
1025          * set to 0 if the message state is illegal.
1026          *
1027          * The two can be told apart because outer-transaction commands
1028          * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
1029          */
1030 done:
1031         if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
1032                 if (state != &iocom->state0) {
1033                         msg->tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1034                                     (msg->any.head.cmd & (DMSGF_CREATE |
1035                                                           DMSGF_DELETE |
1036                                                           DMSGF_REPLY));
1037                 } else {
1038                         msg->tcmd = 0;
1039                 }
1040         } else {
1041                 msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
1042         }
1043
1044         /*
1045          * Adjust the state for DELETE handling now, before making the
1046          * callback so we are atomic with other state updates.
1047          *
1048          * Subq/parent linkages are cleaned up after the callback.
1049          * If an error occurred the message is ignored and state is not
1050          * updated.
1051          */
1052         if ((state = msg->state) == NULL || error != 0) {
1053                 kdio_printf(iocom, 1,
1054                             "msgrx: state=%p error %d\n",
1055                             state, error);
1056         } else if (msg->any.head.cmd & DMSGF_DELETE) {
1057                 KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1058                 state->rxcmd |= DMSGF_DELETE;
                     /*
                      * txcmd already carries DELETE, so with the receive side
                      * now marked as well the state is closed in both
                      * directions.  Remove it from whichever tree it lives on
                      * (statewr_tree for reply traffic, staterd_tree
                      * otherwise) and drop the reference the tree held.
                      */
1059                 if (state->txcmd & DMSGF_DELETE) {
1060                         KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1061                         if (state->rxcmd & DMSGF_REPLY) {
1062                                 KKASSERT(msg->any.head.cmd &
1063                                          DMSGF_REPLY);
1064                                 RB_REMOVE(kdmsg_state_tree,
1065                                           &iocom->statewr_tree, state);
1066                         } else {
1067                                 KKASSERT((msg->any.head.cmd &
1068                                           DMSGF_REPLY) == 0);
1069                                 RB_REMOVE(kdmsg_state_tree,
1070                                           &iocom->staterd_tree, state);
1071                         }
1072                         state->flags &= ~KDMSG_STATE_RBINSERTED;
1073                         kdmsg_state_drop(state);        /* state on rbtree */
1074                 }
1075         }
1076         lockmgr(&iocom->msglk, LK_RELEASE);
1077
1078         return (error);
1079 }
1080
1081 /*
1082  * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
1083  * This routine must call iocom->rcvmsg() for anything not automatically
1084  * handled.
1085  */
1086 static int
1087 kdmsg_autorxmsg(kdmsg_msg_t *msg)
1088 {
1089         kdmsg_iocom_t *iocom = msg->state->iocom;
1090         kdmsg_msg_t *rep;
1091         int error = 0;
1092         uint32_t cmd;
1093
1094         /*
1095          * Main switch processes transaction create/delete sequences only.
1096          * Use icmd (DELETEs use DMSG_LNK_ERROR
1097          *
1098          * NOTE: If processing in-transaction messages you generally want
1099          *       an inner switch on msg->any.head.cmd.
1100          */
1101         if (msg->state) {
1102                 cmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1103                       (msg->any.head.cmd & (DMSGF_CREATE |
1104                                             DMSGF_DELETE |
1105                                             DMSGF_REPLY));
1106         } else {
1107                 cmd = 0;
1108         }
1109
1110         switch(cmd) {
1111         case DMSG_LNK_PING:
1112                 /*
1113                  * Received ping, send reply
1114                  */
1115                 rep = kdmsg_msg_alloc(msg->state, DMSG_LNK_PING | DMSGF_REPLY,
1116                                       NULL, NULL);
1117                 kdmsg_msg_write(rep);
1118                 break;
1119         case DMSG_LNK_PING | DMSGF_REPLY:
1120                 /* ignore replies */
1121                 break;
1122         case DMSG_LNK_CONN | DMSGF_CREATE:
1123         case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
1124                 /*
1125                  * Received LNK_CONN transaction.  Transmit response and
1126                  * leave transaction open, which allows the other end to
1127                  * start to the SPAN protocol.
1128                  *
1129                  * Handle shim after acknowledging the CONN.
1130                  */
1131                 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1132                         if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1133                                 kdmsg_msg_result(msg, 0);
1134                                 if (iocom->auto_callback)
1135                                         iocom->auto_callback(msg);
1136                         } else {
1137                                 error = iocom->rcvmsg(msg);
1138                         }
1139                         break;
1140                 }
1141                 /* fall through */
1142         case DMSG_LNK_CONN | DMSGF_DELETE:
1143                 /*
1144                  * This message is usually simulated after a link is lost
1145                  * to clean up the transaction.
1146                  */
1147                 if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1148                         if (iocom->auto_callback)
1149                                 iocom->auto_callback(msg);
1150                         kdmsg_msg_reply(msg, 0);
1151                 } else {
1152                         error = iocom->rcvmsg(msg);
1153                 }
1154                 break;
1155         case DMSG_LNK_SPAN | DMSGF_CREATE:
1156         case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1157                 /*
1158                  * Received LNK_SPAN transaction.  We do not have to respond
1159                  * (except on termination), but we must leave the transaction
1160                  * open.
1161                  *
1162                  * Handle shim after acknowledging the SPAN.
1163                  */
1164                 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1165                         if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1166                                 if (iocom->auto_callback)
1167                                         iocom->auto_callback(msg);
1168                                 break;
1169                         }
1170                         /* fall through */
1171                 } else {
1172                         error = iocom->rcvmsg(msg);
1173                         break;
1174                 }
1175                 /* fall through */
1176         case DMSG_LNK_SPAN | DMSGF_DELETE:
1177                 /*
1178                  * Process shims (auto_callback) before cleaning up the
1179                  * circuit structure and closing the transactions.  Device
1180                  * driver should ensure that the circuit is not used after
1181                  * the auto_callback() returns.
1182                  *
1183                  * Handle shim before closing the SPAN transaction.
1184                  */
1185                 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1186                         if (iocom->auto_callback)
1187                                 iocom->auto_callback(msg);
1188                         kdmsg_msg_reply(msg, 0);
1189                 } else {
1190                         error = iocom->rcvmsg(msg);
1191                 }
1192                 break;
1193         default:
1194                 /*
1195                  * Anything unhandled goes into rcvmsg.
1196                  *
1197                  * NOTE: Replies to link-level messages initiated by our side
1198                  *       are handled by the state callback, they are NOT
1199                  *       handled here.
1200                  */
1201                 error = iocom->rcvmsg(msg);
1202                 break;
1203         }
1204         return (error);
1205 }
1206
1207 /*
1208  * Post-receive-handling message and state cleanup.  This routine is called
1209  * after the state function handling/callback to properly dispose of the
1210  * message and unlink the state's parent/subq linkage if the state is
1211  * completely closed.
1212  *
1213  * msglk is not held.
1214  */
1215 static
1216 void
1217 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1218 {
1219         kdmsg_state_t *state = msg->state;
1220         kdmsg_iocom_t *iocom = state->iocom;
1221
1222         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1223         if (state != &iocom->state0) {
1224                 /*
1225                  * When terminating a transaction (in either direction), all
1226                  * sub-states are aborted.
1227                  */
1228                 if ((msg->any.head.cmd & DMSGF_DELETE) &&
1229                     TAILQ_FIRST(&msg->state->subq)) {
1230                         kdio_printf(iocom, 2,
1231                                     "simulate failure for substates of "
1232                                     "state %p cmd %08x/%08x\n",
1233                                     msg->state,
1234                                     msg->state->rxcmd,
1235                                     msg->state->txcmd);
1236                         kdmsg_simulate_failure(msg->state,
1237                                                0, DMSG_ERR_LOSTLINK);
1238                 }
1239
1240                 /*
1241                  * Once the state is fully closed we can (try to) remove it
1242                  * from the subq topology.
1243                  */
1244                 if ((state->flags & KDMSG_STATE_SUBINSERTED) &&
1245                     (state->rxcmd & DMSGF_DELETE) &&
1246                     (state->txcmd & DMSGF_DELETE)) {
1247                         /* 
1248                          * Remove parent linkage if state is completely closed.
1249                          */
1250                         kdmsg_subq_delete(state);
1251                 }
1252         }
1253         kdmsg_msg_free(msg);
1254
1255         lockmgr(&iocom->msglk, LK_RELEASE);
1256 }
1257
1258 /*
1259  * Remove state from its parent's subq.  This can wind up recursively
1260  * dropping the parent upward.
1261  *
1262  * NOTE: Once we drop the parent, our pstate pointer may become invalid.
1263  */
1264 static
1265 void
1266 kdmsg_subq_delete(kdmsg_state_t *state)
1267 {
1268         kdmsg_state_t *pstate;
1269
1270         if (state->flags & KDMSG_STATE_SUBINSERTED) {
1271                 pstate = state->parent;
1272                 KKASSERT(pstate);
1273                 if (pstate->scan == state)
1274                         pstate->scan = NULL;
1275                 TAILQ_REMOVE(&pstate->subq, state, entry);
1276                 state->flags &= ~KDMSG_STATE_SUBINSERTED;
1277                 state->parent = NULL;
1278                 if (TAILQ_EMPTY(&pstate->subq)) {
1279                         kdmsg_state_drop(pstate);/* pstate->subq */
1280                 }
1281                 pstate = NULL;                   /* safety */
1282                 kdmsg_state_drop(state);         /* pstate->subq */
1283         } else {
1284                 KKASSERT(state->parent == NULL);
1285         }
1286 }
1287
1288 /*
1289  * Simulate receiving a message which terminates an active transaction
1290  * state.  Our simulated received message must set DELETE and may also
1291  * have to set CREATE.  It must also ensure that all fields are set such
1292  * that the receive handling code can find the state (kdmsg_state_msgrx())
1293  * or an endless loop will ensue.
1294  *
1295  * This is used when the other end of the link is dead so the device driver
1296  * gets a completed transaction for all pending states.
1297  *
1298  * Called with iocom locked.
1299  */
1300 static
1301 void
1302 kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error)
1303 {
1304         kdmsg_state_t *substate;
1305
1306         kdmsg_state_hold(state);                /* aborting */
1307
1308         /*
1309          * Abort parent state first. Parent will not actually disappear
1310          * until children are gone.  Device drivers must handle the situation.
1311          * The advantage of this is that device drivers can flag the situation
1312          * as an interlock against new operations on dying states.  And since
1313          * device operations are often asynchronous anyway, this sequence of
1314          * events works out better.
1315          */
1316         if (meto)
1317                 kdmsg_state_abort(state);
1318
1319         /*
1320          * Recurse through any children.
1321          */
1322 again:
1323         TAILQ_FOREACH(substate, &state->subq, entry) {
1324                 if (substate->flags & KDMSG_STATE_ABORTING)
1325                         continue;
1326                 state->scan = substate;
1327                 kdmsg_simulate_failure(substate, 1, error);
1328                 if (state->scan != substate)
1329                         goto again;
1330         }
1331         kdmsg_state_drop(state);                /* aborting */
1332 }
1333
1334 static
1335 void
1336 kdmsg_state_abort(kdmsg_state_t *state)
1337 {
1338         kdmsg_msg_t *msg;
1339
1340         /*
1341          * Set ABORTING and DYING, return if already set.  If the state was
1342          * just allocated we defer the abort operation until the related
1343          * message is processed.
1344          */
1345         KKASSERT((state->flags & KDMSG_STATE_ABORTING) == 0);
1346         if (state->flags & KDMSG_STATE_ABORTING)
1347                 return;
1348         state->flags |= KDMSG_STATE_ABORTING;
1349         kdmsg_state_dying(state);
1350         if (state->flags & KDMSG_STATE_NEW) {
1351                 kdio_printf(iocom, 5,
1352                             "kdmsg_state_abort(0): state %p rxcmd %08x "
1353                             "txcmd %08x flags %08x - in NEW state\n",
1354                             state, state->rxcmd,
1355                             state->txcmd, state->flags);
1356                 return;
1357         }
1358
1359         /*
1360          * NOTE: The DELETE flag might already be set due to an early
1361          *       termination.
1362          *
1363          * NOTE: Args to kdmsg_msg_alloc() to avoid dynamic state allocation.
1364          *
1365          * NOTE: We are simulating a received message using our state
1366          *       (vs a message generated by the other side using its state),
1367          *       so we must invert DMSGF_REVTRANS and DMSGF_REVCIRC.
1368          */
1369         kdio_printf(iocom, 5, 
1370                     "kdmsg_state_abort(1): state %p rxcmd %08x txcmd %08x\n",
1371                     state, state->rxcmd, state->txcmd);
1372         if ((state->rxcmd & DMSGF_DELETE) == 0) {
1373                 msg = kdmsg_msg_alloc(state, DMSG_LNK_ERROR, NULL, NULL);
1374                 if ((state->rxcmd & DMSGF_CREATE) == 0)
1375                         msg->any.head.cmd |= DMSGF_CREATE;
1376                 msg->any.head.cmd |= DMSGF_DELETE |
1377                                      (state->rxcmd & DMSGF_REPLY);
1378                 msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
1379                 msg->any.head.error = DMSG_ERR_LOSTLINK;
1380                 kdio_printf(iocom, 5,
1381                             "kdmsg_state_abort(a): state %p msgcmd %08x\n",
1382                             state, msg->any.head.cmd);
1383                 /* circuit not initialized */
1384                 lockmgr(&state->iocom->msglk, LK_RELEASE);
1385                 kdmsg_msg_receive_handling(msg);
1386                 lockmgr(&state->iocom->msglk, LK_EXCLUSIVE);
1387                 msg = NULL;
1388         }
1389         kdio_printf(iocom, 5,
1390                     "kdmsg_state_abort(2): state %p rxcmd %08x txcmd %08x\n",
1391                     state, state->rxcmd, state->txcmd);
1392 }
1393
1394 /*
1395  * Recursively sets KDMSG_STATE_DYING on state and all sub-states, preventing
1396  * the transmission of any new messages on these states.  This is done
1397  * atomically when parent state is terminating, whereas setting ABORTING is
1398  * not atomic and can leak races.
1399  */
1400 static
1401 void
1402 kdmsg_state_dying(kdmsg_state_t *state)
1403 {
1404         kdmsg_state_t *scan;
1405
1406         if ((state->flags & KDMSG_STATE_DYING) == 0) {
1407                 state->flags |= KDMSG_STATE_DYING;
1408                 TAILQ_FOREACH(scan, &state->subq, entry)
1409                         kdmsg_state_dying(scan);
1410         }
1411 }
1412
1413 /*
1414  * Process state tracking for a message prior to transmission.
1415  *
1416  * Called with msglk held and the msg dequeued.  Returns non-zero if
1417  * the message is bad and should be deleted by the caller.
1418  *
1419  * One-off messages are usually with dummy state and msg->state may be NULL
1420  * in this situation.
1421  *
1422  * New transactions (when CREATE is set) will insert the state.
1423  *
1424  * May request that caller discard the message by setting *discardp to 1.
1425  * A NULL state may be returned in this case.
1426  */
1427 static
1428 int
1429 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1430 {
1431         kdmsg_iocom_t *iocom = msg->state->iocom;
1432         kdmsg_state_t *state;
1433         int error;
1434
1435         /*
1436          * Make sure a state structure is ready to go in case we need a new
1437          * one.  This is the only routine which uses freewr_state so no
1438          * races are possible.
1439          */
1440         if ((state = iocom->freewr_state) == NULL) {
1441                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1442                 state->flags = KDMSG_STATE_DYNAMIC;
1443                 state->iocom = iocom;
1444                 state->refs = 1;
1445                 TAILQ_INIT(&state->subq);
1446                 iocom->freewr_state = state;
1447         }
1448
1449         /*
1450          * Lock RB tree.  If persistent state is present it will have already
1451          * been assigned to msg.
1452          */
1453         state = msg->state;
1454
1455         /*
1456          * Short-cut one-off or mid-stream messages (state may be NULL).
1457          */
1458         if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1459                                   DMSGF_ABORT)) == 0) {
1460                 return(0);
1461         }
1462
1463
1464         /*
1465          * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1466          * inside the case statements.
1467          */
1468         switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1469                                     DMSGF_REPLY)) {
1470         case DMSGF_CREATE:
1471         case DMSGF_CREATE | DMSGF_DELETE:
1472                 /*
1473                  * Insert the new persistent message state and mark
1474                  * half-closed if DELETE is set.  Since this is a new
1475                  * message it isn't possible to transition into the fully
1476                  * closed state here.
1477                  *
1478                  * XXX state must be assigned and inserted by
1479                  *     kdmsg_msg_write().  txcmd is assigned by us
1480                  *     on-transmit.
1481                  */
1482                 KKASSERT(state != NULL);
1483                 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1484                 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1485                 state->rxcmd = DMSGF_REPLY;
1486                 state->flags &= ~KDMSG_STATE_NEW;
1487                 error = 0;
1488                 break;
1489         case DMSGF_DELETE:
1490                 /*
1491                  * Sent ABORT+DELETE in case where msgid has already
1492                  * been fully closed, ignore the message.
1493                  */
1494                 if (state == &iocom->state0) {
1495                         if (msg->any.head.cmd & DMSGF_ABORT) {
1496                                 error = EALREADY;
1497                         } else {
1498                                 kdio_printf(iocom, 1,
1499                                         "msgtx: no state match "
1500                                         "for DELETE cmd=%08x msgid=%016jx\n",
1501                                         msg->any.head.cmd,
1502                                         (intmax_t)msg->any.head.msgid);
1503                                 error = EINVAL;
1504                         }
1505                         break;
1506                 }
1507
1508                 /*
1509                  * Sent ABORT+DELETE in case where msgid has
1510                  * already been reused for an unrelated message,
1511                  * ignore the message.
1512                  */
1513                 if ((state->txcmd & DMSGF_CREATE) == 0) {
1514                         if (msg->any.head.cmd & DMSGF_ABORT) {
1515                                 error = EALREADY;
1516                         } else {
1517                                 kdio_printf(iocom, 1, "%s\n",
1518                                             "msgtx: state reused "
1519                                             "for DELETE");
1520                                 error = EINVAL;
1521                         }
1522                         break;
1523                 }
1524                 error = 0;
1525                 break;
1526         default:
1527                 /*
1528                  * Check for mid-stream ABORT command sent
1529                  */
1530                 if (msg->any.head.cmd & DMSGF_ABORT) {
1531                         if (state == &state->iocom->state0 ||
1532                             (state->txcmd & DMSGF_CREATE) == 0) {
1533                                 error = EALREADY;
1534                                 break;
1535                         }
1536                 }
1537                 error = 0;
1538                 break;
1539         case DMSGF_REPLY | DMSGF_CREATE:
1540         case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1541                 /*
1542                  * When transmitting a reply with CREATE set the original
1543                  * persistent state message should already exist.
1544                  */
1545                 if (state == &state->iocom->state0) {
1546                         kdio_printf(iocom, 1, "%s\n",
1547                                     "msgtx: no state match "
1548                                     "for REPLY | CREATE");
1549                         error = EINVAL;
1550                         break;
1551                 }
1552                 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1553                 error = 0;
1554                 break;
1555         case DMSGF_REPLY | DMSGF_DELETE:
1556                 /*
1557                  * When transmitting a reply with DELETE set the original
1558                  * persistent state message should already exist.
1559                  *
1560                  * This is very similar to the REPLY|CREATE|* case except
1561                  * txcmd is already stored, so we just add the DELETE flag.
1562                  *
1563                  * Sent REPLY+ABORT+DELETE in case where msgid has
1564                  * already been fully closed, ignore the message.
1565                  */
1566                 if (state == &state->iocom->state0) {
1567                         if (msg->any.head.cmd & DMSGF_ABORT) {
1568                                 error = EALREADY;
1569                         } else {
1570                                 kdio_printf(iocom, 1, "%s\n",
1571                                             "msgtx: no state match "
1572                                             "for REPLY | DELETE");
1573                                 error = EINVAL;
1574                         }
1575                         break;
1576                 }
1577
1578                 /*
1579                  * Sent REPLY+ABORT+DELETE in case where msgid has already
1580                  * been reused for an unrelated message, ignore the message.
1581                  */
1582                 if ((state->txcmd & DMSGF_CREATE) == 0) {
1583                         if (msg->any.head.cmd & DMSGF_ABORT) {
1584                                 error = EALREADY;
1585                         } else {
1586                                 kdio_printf(iocom, 1, "%s\n",
1587                                             "msgtx: state reused "
1588                                             "for REPLY | DELETE");
1589                                 error = EINVAL;
1590                         }
1591                         break;
1592                 }
1593                 error = 0;
1594                 break;
1595         case DMSGF_REPLY:
1596                 /*
1597                  * Check for mid-stream ABORT reply sent.
1598                  *
1599                  * One-off REPLY messages are allowed for e.g. status updates.
1600                  */
1601                 if (msg->any.head.cmd & DMSGF_ABORT) {
1602                         if (state == &state->iocom->state0 ||
1603                             (state->txcmd & DMSGF_CREATE) == 0) {
1604                                 error = EALREADY;
1605                                 break;
1606                         }
1607                 }
1608                 error = 0;
1609                 break;
1610         }
1611
1612         /*
1613          * Set interlock (XXX hack) in case the send side blocks and a
1614          * response is returned before kdmsg_state_cleanuptx() can be
1615          * run.
1616          */
1617         if (state && error == 0)
1618                 state->flags |= KDMSG_STATE_INTERLOCK;
1619
1620         return (error);
1621 }
1622
1623 /*
1624  * Called with iocom locked.
1625  */
1626 static
1627 void
1628 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1629 {
1630         kdmsg_iocom_t *iocom = msg->state->iocom;
1631         kdmsg_state_t *state;
1632
1633         if ((state = msg->state) == NULL) {
1634                 kdmsg_msg_free(msg);
1635                 return;
1636         }
1637
1638         /*
1639          * Clear interlock (XXX hack) in case the send side blocks and a
1640          * response is returned in the other thread before
1641          * kdmsg_state_cleanuptx() can be run.  We maintain our hold on
1642          * iocom->msglk so we can do this before completing our task.
1643          */
1644         if (state->flags & KDMSG_STATE_SIGNAL) {
1645                 kdio_printf(iocom, 1, "state %p interlock!\n", state);
1646                 wakeup(state);
1647         }
1648         state->flags &= ~(KDMSG_STATE_INTERLOCK | KDMSG_STATE_SIGNAL);
1649         kdmsg_state_hold(state);
1650
1651         if (msg->any.head.cmd & DMSGF_DELETE) {
1652                 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1653                 state->txcmd |= DMSGF_DELETE;
1654                 if (state->rxcmd & DMSGF_DELETE) {
1655                         KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1656                         if (state->txcmd & DMSGF_REPLY) {
1657                                 KKASSERT(msg->any.head.cmd &
1658                                          DMSGF_REPLY);
1659                                 RB_REMOVE(kdmsg_state_tree,
1660                                           &iocom->staterd_tree, state);
1661                         } else {
1662                                 KKASSERT((msg->any.head.cmd &
1663                                           DMSGF_REPLY) == 0);
1664                                 RB_REMOVE(kdmsg_state_tree,
1665                                           &iocom->statewr_tree, state);
1666                         }
1667                         state->flags &= ~KDMSG_STATE_RBINSERTED;
1668
1669                         /*
1670                          * The subq recursion is used for parent linking and
1671                          * scanning the topology for aborts, we can only
1672                          * remove leafs.  The circuit is effectively dead now,
1673                          * but topology won't be torn down until all of its
1674                          * children have finished/aborted.
1675                          *
1676                          * This is particularly important for end-point
1677                          * devices which might need to access private data
1678                          * in parent states.  Out of order disconnects can
1679                          * occur if an end-point device is processing a
1680                          * message transaction asynchronously because abort
1681                          * requests are basically synchronous and it probably
1682                          * isn't convenient (or possible) for the end-point
1683                          * to abort an asynchronous operation.
1684                          */
1685                         if (TAILQ_EMPTY(&state->subq))
1686                                 kdmsg_subq_delete(state);
1687                         kdmsg_msg_free(msg);
1688                         kdmsg_state_drop(state);   /* state on rbtree */
1689                 } else {
1690                         kdmsg_msg_free(msg);
1691                 }
1692         } else {
1693                 kdmsg_msg_free(msg);
1694         }
1695
1696         /*
1697          * Deferred abort after transmission.
1698          */
1699         if ((state->flags & (KDMSG_STATE_ABORTING | KDMSG_STATE_DYING)) &&
1700             (state->rxcmd & DMSGF_DELETE) == 0) {
1701                 kdio_printf(iocom, 5,
1702                             "kdmsg_state_cleanuptx: state=%p "
1703                             "executing deferred abort\n",
1704                             state);
1705                 state->flags &= ~KDMSG_STATE_ABORTING;
1706                 kdmsg_state_abort(state);
1707         }
1708         kdmsg_state_drop(state);
1709 }
1710
1711 static
1712 void
1713 _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1714 {
1715         atomic_add_int(&state->refs, 1);
1716 #if KDMSG_DEBUG
1717         kd_printf(4, "state %p +%d\t%s:%d\n", state, state->refs, file, line);
1718 #endif
1719 }
1720
1721 static
1722 void
1723 _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1724 {
1725         KKASSERT(state->refs > 0);
1726 #if KDMSG_DEBUG
1727         kd_printf(4, "state %p -%d\t%s:%d\n", state, state->refs, file, line);
1728 #endif
1729         if (atomic_fetchadd_int(&state->refs, -1) == 1)
1730                 kdmsg_state_free(state);
1731 }
1732
1733 static
1734 void
1735 kdmsg_state_free(kdmsg_state_t *state)
1736 {
1737         kdmsg_iocom_t *iocom = state->iocom;
1738
1739         KKASSERT((state->flags & KDMSG_STATE_RBINSERTED) == 0);
1740         KKASSERT((state->flags & KDMSG_STATE_SUBINSERTED) == 0);
1741         KKASSERT(TAILQ_EMPTY(&state->subq));
1742
1743         if (state != &state->iocom->state0)
1744                 kfree(state, iocom->mmsg);
1745 }
1746
1747 kdmsg_msg_t *
1748 kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
1749                 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1750 {
1751         kdmsg_iocom_t *iocom = state->iocom;
1752         kdmsg_state_t *pstate;
1753         kdmsg_msg_t *msg;
1754         size_t hbytes;
1755
1756         KKASSERT(iocom != NULL);
1757         hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1758         msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1759                       iocom->mmsg, M_WAITOK | M_ZERO);
1760         msg->hdr_size = hbytes;
1761
1762         if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
1763                 /*
1764                  * New transaction, requires tracking state and a unique
1765                  * msgid to be allocated.
1766                  *
1767                  * It is possible to race a circuit failure, inherit the
1768                  * parent's STATE_DYING flag to trigger an abort sequence
1769                  * in the transmit path.  By not inheriting ABORTING the
1770                  * abort sequence can recurse.
1771                  *
1772                  * NOTE: The transactions has not yet been initiated so we
1773                  *       cannot set DMSGF_CREATE/DELETE bits in txcmd or rxcmd.
1774                  *       We have to properly setup DMSGF_REPLY, however.
1775                  */
1776                 pstate = state;
1777                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1778                 TAILQ_INIT(&state->subq);
1779                 state->iocom = iocom;
1780                 state->parent = pstate;
1781                 state->flags = KDMSG_STATE_DYNAMIC |
1782                                KDMSG_STATE_NEW;
1783                 state->func = func;
1784                 state->any.any = data;
1785                 state->msgid = (uint64_t)(uintptr_t)state;
1786                 /*msg->any.head.msgid = state->msgid;XXX*/
1787
1788                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1789                 if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1790                         panic("duplicate msgid allocated");
1791                 if (TAILQ_EMPTY(&pstate->subq))
1792                         kdmsg_state_hold(pstate);/* pstate->subq */
1793                 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
1794                 state->flags |= KDMSG_STATE_RBINSERTED |
1795                                 KDMSG_STATE_SUBINSERTED;
1796                 state->flags |= pstate->flags & KDMSG_STATE_DYING;
1797                 kdmsg_state_hold(state);        /* pstate->subq */
1798                 kdmsg_state_hold(state);        /* state on rbtree */
1799                 kdmsg_state_hold(state);        /* msg->state */
1800                 lockmgr(&iocom->msglk, LK_RELEASE);
1801         } else {
1802                 pstate = state->parent;
1803                 KKASSERT(pstate != NULL);
1804                 kdmsg_state_hold(state);        /* msg->state */
1805         }
1806
1807         if (state->flags & KDMSG_STATE_OPPOSITE)
1808                 cmd |= DMSGF_REVTRANS;
1809         if (pstate->flags & KDMSG_STATE_OPPOSITE)
1810                 cmd |= DMSGF_REVCIRC;
1811
1812         msg->any.head.magic = DMSG_HDR_MAGIC;
1813         msg->any.head.cmd = cmd;
1814         msg->any.head.msgid = state->msgid;
1815         msg->any.head.circuit = pstate->msgid;
1816         msg->state = state;
1817
1818         return (msg);
1819 }
1820
1821 void
1822 kdmsg_msg_free(kdmsg_msg_t *msg)
1823 {
1824         kdmsg_iocom_t *iocom = msg->state->iocom;
1825         kdmsg_state_t *state;
1826
1827         if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1828             msg->aux_data && msg->aux_size) {
1829                 kfree(msg->aux_data, iocom->mmsg);
1830                 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1831         }
1832         if ((state = msg->state) != NULL) {
1833                 msg->state = NULL;
1834                 kdmsg_state_drop(state);        /* msg->state */
1835         }
1836         msg->aux_data = NULL;
1837         msg->aux_size = 0;
1838
1839         kfree(msg, iocom->mmsg);
1840 }
1841
1842 void
1843 kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data)
1844 {
1845         if (msg->flags & KDMSG_FLAG_AUXALLOC) {
1846                 data->aux_data = msg->aux_data;
1847                 data->aux_size = msg->aux_size;
1848                 data->iocom = msg->state->iocom;
1849                 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1850         } else {
1851                 data->aux_data = NULL;
1852                 data->aux_size = 0;
1853                 data->iocom = msg->state->iocom;
1854         }
1855 }
1856
1857 void
1858 kdmsg_free_aux_data(kdmsg_data_t *data)
1859 {
1860         if (data->aux_data)
1861                 kfree(data->aux_data, data->iocom->mmsg);
1862 }
1863
1864 /*
1865  * Indexed messages are stored in a red-black tree indexed by their
1866  * msgid.  Only persistent messages are indexed.
1867  */
1868 int
1869 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1870 {
1871         if (state1->iocom < state2->iocom)
1872                 return(-1);
1873         if (state1->iocom > state2->iocom)
1874                 return(1);
1875         if (state1->msgid < state2->msgid)
1876                 return(-1);
1877         if (state1->msgid > state2->msgid)
1878                 return(1);
1879         return(0);
1880 }
1881
1882 /*
1883  * Write a message.  All requisit command flags have been set.
1884  *
1885  * If msg->state is non-NULL the message is written to the existing
1886  * transaction.  msgid will be set accordingly.
1887  *
1888  * If msg->state is NULL and CREATE is set new state is allocated and
1889  * (func, data) is installed.  A msgid is assigned.
1890  *
1891  * If msg->state is NULL and CREATE is not set the message is assumed
1892  * to be a one-way message.  The originator must assign the msgid
1893  * (or leave it 0, which is typical.
1894  *
1895  * This function merely queues the message to the management thread, it
1896  * does not write to the message socket/pipe.
1897  */
1898 void
1899 kdmsg_msg_write(kdmsg_msg_t *msg)
1900 {
1901         kdmsg_iocom_t *iocom = msg->state->iocom;
1902
1903         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1904         kdmsg_msg_write_locked(iocom, msg);
1905         lockmgr(&iocom->msglk, LK_RELEASE);
1906 }
1907
1908 static void
1909 kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg)
1910 {
1911         kdmsg_state_t *state;
1912
1913         if (msg->state) {
1914                 /*
1915                  * Continuance or termination of existing transaction.
1916                  * The transaction could have been initiated by either end.
1917                  *
1918                  * (Function callback and aux data for the receive side can
1919                  * be replaced or left alone).
1920                  */
1921                 state = msg->state;
1922                 msg->any.head.msgid = state->msgid;
1923         } else {
1924                 /*
1925                  * One-off message (always uses msgid 0 to distinguish
1926                  * between a possibly lost in-transaction message due to
1927                  * competing aborts and a real one-off message?)
1928                  */
1929                 state = NULL;
1930                 msg->any.head.msgid = 0;
1931         }
1932
1933 #if 0
1934         /*
1935          * XXX removed - don't make this a panic, allow the state checks
1936          *     below to catch the situation.
1937          *
1938          * This flag is not set until after the tx thread has drained
1939          * the tx msgq and simulated responses.  After that point the
1940          * txthread is dead and can no longer simulate responses.
1941          *
1942          * Device drivers should never try to send a message once this
1943          * flag is set.  They should have detected (through the state
1944          * closures) that the link is in trouble.
1945          */
1946         if (iocom->flags & KDMSG_IOCOMF_EXITNOACC) {
1947                 lockmgr(&iocom->msglk, LK_RELEASE);
1948                 panic("kdmsg_msg_write: Attempt to write message to "
1949                       "terminated iocom\n");
1950         }
1951 #endif
1952
1953         /*
1954          * For stateful messages, if the circuit is dead or dying we have
1955          * to abort the potentially newly-created state and discard the
1956          * message.
1957          *
1958          * - We must discard the message because the other end will not
1959          *   be expecting any more messages over the dead or dying circuit
1960          *   and might not be able to receive them.
1961          *
1962          * - We abort the state by simulating a failure to generate a fake
1963          *   incoming DELETE.  This will trigger the state callback and allow
1964          *   the device to clean things up and reply, closing the outgoing
1965          *   direction and allowing the state to be freed.
1966          *
1967          * This situation occurs quite often, particularly as SPANs stabilize.
1968          * End-points must do the right thing.
1969          */
1970         if (state) {
1971                 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1972                 if (state->flags & KDMSG_STATE_DYING) {
1973 #if 0
1974                 if ((state->flags & KDMSG_STATE_DYING) ||
1975                     (state->parent->txcmd & DMSGF_DELETE) ||
1976                     (state->parent->flags & KDMSG_STATE_DYING)) {
1977 #endif
1978                         kdio_printf(iocom, 4,
1979                                     "kdmsg_msg_write: Write to dying circuit "
1980                                     "state=%p "
1981                                     "ptxcmd=%08x prxcmd=%08x flags=%08x\n",
1982                                     state,
1983                                     state->parent->rxcmd,
1984                                     state->parent->txcmd,
1985                                     state->parent->flags);
1986                         kdmsg_state_hold(state);
1987                         kdmsg_state_msgtx(msg);
1988                         kdmsg_state_cleanuptx(msg);
1989                         kdmsg_state_drop(state);
1990                         return;
1991                 }
1992         }
1993
1994         /*
1995          * Finish up the msg fields.  Note that msg->aux_size and the
1996          * aux_bytes stored in the message header represent the unaligned
1997          * (actual) bytes of data, but the buffer is sized to an aligned
1998          * size and the CRC is generated over the aligned length.
1999          */
2000         msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
2001         ++iocom->msg_seq;
2002
2003         if (msg->aux_data && msg->aux_size) {
2004                 uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
2005
2006                 msg->any.head.aux_bytes = msg->aux_size;
2007                 msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
2008         }
2009         msg->any.head.hdr_crc = 0;
2010         msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
2011
2012         TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
2013
2014         if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
2015                 atomic_clear_int(&iocom->msg_ctl,
2016                                  KDMSG_CLUSTERCTL_SLEEPING);
2017                 wakeup(&iocom->msg_ctl);
2018         }
2019 }
2020
2021 /*
2022  * Reply to a message and terminate our side of the transaction.
2023  *
2024  * If msg->state is non-NULL we are replying to a one-way message.
2025  */
2026 void
2027 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
2028 {
2029         kdmsg_state_t *state = msg->state;
2030         kdmsg_msg_t *nmsg;
2031         uint32_t cmd;
2032
2033         /*
2034          * Reply with a simple error code and terminate the transaction.
2035          */
2036         cmd = DMSG_LNK_ERROR;
2037
2038         /*
2039          * Check if our direction has even been initiated yet, set CREATE.
2040          *
2041          * Check what direction this is (command or reply direction).  Note
2042          * that txcmd might not have been initiated yet.
2043          *
2044          * If our direction has already been closed we just return without
2045          * doing anything.
2046          */
2047         if (state != &state->iocom->state0) {
2048                 if (state->txcmd & DMSGF_DELETE)
2049                         return;
2050                 if ((state->txcmd & DMSGF_CREATE) == 0)
2051                         cmd |= DMSGF_CREATE;
2052                 if (state->txcmd & DMSGF_REPLY)
2053                         cmd |= DMSGF_REPLY;
2054                 cmd |= DMSGF_DELETE;
2055         } else {
2056                 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2057                         cmd |= DMSGF_REPLY;
2058         }
2059
2060         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2061         nmsg->any.head.error = error;
2062         kdmsg_msg_write(nmsg);
2063 }
2064
2065 /*
2066  * Reply to a message and continue our side of the transaction.
2067  *
2068  * If msg->state is non-NULL we are replying to a one-way message and this
2069  * function degenerates into the same as kdmsg_msg_reply().
2070  */
2071 void
2072 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
2073 {
2074         kdmsg_state_t *state = msg->state;
2075         kdmsg_msg_t *nmsg;
2076         uint32_t cmd;
2077
2078         /*
2079          * Return a simple result code, do NOT terminate the transaction.
2080          */
2081         cmd = DMSG_LNK_ERROR;
2082
2083         /*
2084          * Check if our direction has even been initiated yet, set CREATE.
2085          *
2086          * Check what direction this is (command or reply direction).  Note
2087          * that txcmd might not have been initiated yet.
2088          *
2089          * If our direction has already been closed we just return without
2090          * doing anything.
2091          */
2092         if (state != &state->iocom->state0) {
2093                 if (state->txcmd & DMSGF_DELETE)
2094                         return;
2095                 if ((state->txcmd & DMSGF_CREATE) == 0)
2096                         cmd |= DMSGF_CREATE;
2097                 if (state->txcmd & DMSGF_REPLY)
2098                         cmd |= DMSGF_REPLY;
2099                 /* continuing transaction, do not set MSGF_DELETE */
2100         } else {
2101                 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2102                         cmd |= DMSGF_REPLY;
2103         }
2104
2105         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2106         nmsg->any.head.error = error;
2107         kdmsg_msg_write(nmsg);
2108 }
2109
2110 /*
2111  * Reply to a message and terminate our side of the transaction.
2112  *
2113  * If msg->state is non-NULL we are replying to a one-way message.
2114  */
2115 void
2116 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
2117 {
2118         kdmsg_msg_t *nmsg;
2119         uint32_t cmd;
2120
2121         /*
2122          * Reply with a simple error code and terminate the transaction.
2123          */
2124         cmd = DMSG_LNK_ERROR;
2125
2126         /*
2127          * Check if our direction has even been initiated yet, set CREATE.
2128          *
2129          * Check what direction this is (command or reply direction).  Note
2130          * that txcmd might not have been initiated yet.
2131          *
2132          * If our direction has already been closed we just return without
2133          * doing anything.
2134          */
2135         KKASSERT(state);
2136         if (state->txcmd & DMSGF_DELETE)
2137                 return;
2138         if ((state->txcmd & DMSGF_CREATE) == 0)
2139                 cmd |= DMSGF_CREATE;
2140         if (state->txcmd & DMSGF_REPLY)
2141                 cmd |= DMSGF_REPLY;
2142         cmd |= DMSGF_DELETE;
2143
2144         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2145         nmsg->any.head.error = error;
2146         kdmsg_msg_write(nmsg);
2147 }
2148
2149 /*
2150  * Reply to a message and continue our side of the transaction.
2151  *
2152  * If msg->state is non-NULL we are replying to a one-way message and this
2153  * function degenerates into the same as kdmsg_msg_reply().
2154  */
2155 void
2156 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
2157 {
2158         kdmsg_msg_t *nmsg;
2159         uint32_t cmd;
2160
2161         /*
2162          * Return a simple result code, do NOT terminate the transaction.
2163          */
2164         cmd = DMSG_LNK_ERROR;
2165
2166         /*
2167          * Check if our direction has even been initiated yet, set CREATE.
2168          *
2169          * Check what direction this is (command or reply direction).  Note
2170          * that txcmd might not have been initiated yet.
2171          *
2172          * If our direction has already been closed we just return without
2173          * doing anything.
2174          */
2175         KKASSERT(state);
2176         if (state->txcmd & DMSGF_DELETE)
2177                 return;
2178         if ((state->txcmd & DMSGF_CREATE) == 0)
2179                 cmd |= DMSGF_CREATE;
2180         if (state->txcmd & DMSGF_REPLY)
2181                 cmd |= DMSGF_REPLY;
2182         /* continuing transaction, do not set MSGF_DELETE */
2183
2184         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2185         nmsg->any.head.error = error;
2186         kdmsg_msg_write(nmsg);
2187 }