kernel - Add reapctl() system call for managing sub-processes
[dragonfly.git] / sys / kern / kern_dmsg.c
1 /*-
2  * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * TODO: txcmd CREATE state is deferred by txmsgq, need to calculate
36  *       a streaming response.  See subr_diskiocom()'s diskiodone().
37  */
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/priv.h>
52 #include <sys/thread.h>
53 #include <sys/globaldata.h>
54 #include <sys/limits.h>
55
56 #include <sys/dmsg.h>
57
58 RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
59
60 static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
61 static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
62 static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
63 static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
64 static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
65 static void kdmsg_state_abort(kdmsg_state_t *state);
66 static void kdmsg_state_free(kdmsg_state_t *state);
67
68 static void kdmsg_iocom_thread_rd(void *arg);
69 static void kdmsg_iocom_thread_wr(void *arg);
70 static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
71
72 /*static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);*/
73
74 /*
75  * Initialize the roll-up communications structure for a network
76  * messaging session.  This function does not install the socket.
77  */
78 void
79 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
80                  struct malloc_type *mmsg,
81                  int (*rcvmsg)(kdmsg_msg_t *msg))
82 {
83         bzero(iocom, sizeof(*iocom));
84         iocom->handle = handle;
85         iocom->mmsg = mmsg;
86         iocom->rcvmsg = rcvmsg;
87         iocom->flags = flags;
88         lockinit(&iocom->msglk, "h2msg", 0, 0);
89         TAILQ_INIT(&iocom->msgq);
90         RB_INIT(&iocom->staterd_tree);
91         RB_INIT(&iocom->statewr_tree);
92
93         iocom->state0.iocom = iocom;
94         iocom->state0.parent = &iocom->state0;
95         TAILQ_INIT(&iocom->state0.subq);
96 }
97
98 /*
99  * [Re]connect using the passed file pointer.  The caller must ref the
100  * fp for us.  We own that ref now.
101  */
102 void
103 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
104                       const char *subsysname)
105 {
106         /*
107          * Destroy the current connection
108          */
109         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
110         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
111         while (iocom->msgrd_td || iocom->msgwr_td) {
112                 wakeup(&iocom->msg_ctl);
113                 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
114         }
115
116         /*
117          * Drop communications descriptor
118          */
119         if (iocom->msg_fp) {
120                 fdrop(iocom->msg_fp);
121                 iocom->msg_fp = NULL;
122         }
123
124         /*
125          * Setup new communications descriptor
126          */
127         iocom->msg_ctl = 0;
128         iocom->msg_fp = fp;
129         iocom->msg_seq = 0;
130         iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
131
132         lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
133                     NULL, 0, -1, "%s-msgrd", subsysname);
134         lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
135                     NULL, 0, -1, "%s-msgwr", subsysname);
136         lockmgr(&iocom->msglk, LK_RELEASE);
137 }
138
139 /*
140  * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
141  * this function to handle the state machine for LNK_CONN and LNK_SPAN.
142  */
143 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
144 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
145
146 void
147 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
148                          void (*auto_callback)(kdmsg_msg_t *msg))
149 {
150         kdmsg_msg_t *msg;
151
152         iocom->auto_callback = auto_callback;
153
154         msg = kdmsg_msg_alloc(&iocom->state0,
155                               DMSG_LNK_CONN | DMSGF_CREATE,
156                               kdmsg_lnk_conn_reply, NULL);
157         iocom->auto_lnk_conn.head = msg->any.head;
158         msg->any.lnk_conn = iocom->auto_lnk_conn;
159         iocom->conn_state = msg->state;
160         kdmsg_msg_write(msg);
161 }
162
163 static
164 int
165 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
166 {
167         kdmsg_iocom_t *iocom = state->iocom;
168         kdmsg_msg_t *rmsg;
169
170         /*
171          * Upon receipt of the LNK_CONN acknowledgement initiate an
172          * automatic SPAN if we were asked to.  Used by e.g. xdisk, but
173          * not used by HAMMER2 which must manage more than one transmitted
174          * SPAN.
175          */
176         if ((msg->any.head.cmd & DMSGF_CREATE) &&
177             (iocom->flags & KDMSG_IOCOMF_AUTOTXSPAN)) {
178                 rmsg = kdmsg_msg_alloc(&iocom->state0,
179                                        DMSG_LNK_SPAN | DMSGF_CREATE,
180                                        kdmsg_lnk_span_reply, NULL);
181                 iocom->auto_lnk_span.head = rmsg->any.head;
182                 rmsg->any.lnk_span = iocom->auto_lnk_span;
183                 kdmsg_msg_write(rmsg);
184         }
185
186         /*
187          * Process shim after the CONN is acknowledged and before the CONN
188          * transaction is deleted.  For deletions this gives device drivers
189          * the ability to interlock new operations on the circuit before
190          * it becomes illegal and panics.
191          */
192         if (iocom->auto_callback)
193                 iocom->auto_callback(msg);
194
195         if ((state->txcmd & DMSGF_DELETE) == 0 &&
196             (msg->any.head.cmd & DMSGF_DELETE)) {
197                 iocom->conn_state = NULL;
198                 kdmsg_msg_reply(msg, 0);
199         }
200
201         return (0);
202 }
203
204 static
205 int
206 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
207 {
208         /*
209          * Be sure to process shim before terminating the SPAN
210          * transaction.  Gives device drivers the ability to
211          * interlock new operations on the circuit before it
212          * becomes illegal and panics.
213          */
214         if (state->iocom->auto_callback)
215                 state->iocom->auto_callback(msg);
216
217         if ((state->txcmd & DMSGF_DELETE) == 0 &&
218             (msg->any.head.cmd & DMSGF_DELETE)) {
219                 kdmsg_msg_reply(msg, 0);
220         }
221         return (0);
222 }
223
224 /*
225  * Disconnect and clean up
226  */
227 void
228 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
229 {
230         kdmsg_state_t *state;
231
232         /*
233          * Ask the cluster controller to go away
234          */
235         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
236         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
237
238         while (iocom->msgrd_td || iocom->msgwr_td) {
239                 wakeup(&iocom->msg_ctl);
240                 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
241         }
242
243         /*
244          * Cleanup caches
245          */
246         if ((state = iocom->freerd_state) != NULL) {
247                 iocom->freerd_state = NULL;
248                 kdmsg_state_free(state);
249         }
250
251         if ((state = iocom->freewr_state) != NULL) {
252                 iocom->freewr_state = NULL;
253                 kdmsg_state_free(state);
254         }
255
256         /*
257          * Drop communications descriptor
258          */
259         if (iocom->msg_fp) {
260                 fdrop(iocom->msg_fp);
261                 iocom->msg_fp = NULL;
262         }
263         lockmgr(&iocom->msglk, LK_RELEASE);
264 }
265
266 /*
267  * Cluster controller thread.  Perform messaging functions.  We have one
268  * thread for the reader and one for the writer.  The writer handles
269  * shutdown requests (which should break the reader thread).
270  */
271 static
272 void
273 kdmsg_iocom_thread_rd(void *arg)
274 {
275         kdmsg_iocom_t *iocom = arg;
276         dmsg_hdr_t hdr;
277         kdmsg_msg_t *msg = NULL;
278         size_t hbytes;
279         size_t abytes;
280         int error = 0;
281
282         while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILL) == 0) {
283                 /*
284                  * Retrieve the message from the pipe or socket.
285                  */
286                 error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
287                                 NULL, 1, UIO_SYSSPACE);
288                 if (error)
289                         break;
290                 if (hdr.magic != DMSG_HDR_MAGIC) {
291                         kprintf("kdmsg: bad magic: %04x\n", hdr.magic);
292                         error = EINVAL;
293                         break;
294                 }
295                 hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
296                 if (hbytes < sizeof(hdr) || hbytes > DMSG_AUX_MAX) {
297                         kprintf("kdmsg: bad header size %zd\n", hbytes);
298                         error = EINVAL;
299                         break;
300                 }
301
302                 /* XXX messy: mask cmd to avoid allocating state */
303                 msg = kdmsg_msg_alloc(&iocom->state0,
304                                       hdr.cmd & DMSGF_BASECMDMASK,
305                                       NULL, NULL);
306                 msg->any.head = hdr;
307                 msg->hdr_size = hbytes;
308                 if (hbytes > sizeof(hdr)) {
309                         error = fp_read(iocom->msg_fp, &msg->any.head + 1,
310                                         hbytes - sizeof(hdr),
311                                         NULL, 1, UIO_SYSSPACE);
312                         if (error) {
313                                 kprintf("kdmsg: short msg received\n");
314                                 error = EINVAL;
315                                 break;
316                         }
317                 }
318                 msg->aux_size = hdr.aux_bytes;
319                 if (msg->aux_size > DMSG_AUX_MAX) {
320                         kprintf("kdmsg: illegal msg payload size %zd\n",
321                                 msg->aux_size);
322                         error = EINVAL;
323                         break;
324                 }
325                 if (msg->aux_size) {
326                         abytes = DMSG_DOALIGN(msg->aux_size);
327                         msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
328                         msg->flags |= KDMSG_FLAG_AUXALLOC;
329                         error = fp_read(iocom->msg_fp, msg->aux_data,
330                                         abytes, NULL, 1, UIO_SYSSPACE);
331                         if (error) {
332                                 kprintf("kdmsg: short msg payload received\n");
333                                 break;
334                         }
335                 }
336
337                 error = kdmsg_msg_receive_handling(msg);
338                 msg = NULL;
339         }
340
341         if (error)
342                 kprintf("kdmsg: read failed error %d\n", error);
343
344         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
345         if (msg)
346                 kdmsg_msg_free(msg);
347
348         /*
349          * Shutdown the socket before waiting for the transmit side.
350          *
351          * If we are dying due to e.g. a socket disconnect verses being
352          * killed explicity we have to set KILL in order to kick the tx
353          * side when it might not have any other work to do.  KILL might
354          * already be set if we are in an unmount or reconnect.
355          */
356         fp_shutdown(iocom->msg_fp, SHUT_RDWR);
357
358         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
359         wakeup(&iocom->msg_ctl);
360
361         /*
362          * Wait for the transmit side to drain remaining messages
363          * before cleaning up the rx state.  The transmit side will
364          * set KILLTX and wait for the rx side to completely finish
365          * (set msgrd_td to NULL) before cleaning up any remaining
366          * tx states.
367          */
368         lockmgr(&iocom->msglk, LK_RELEASE);
369         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
370         wakeup(&iocom->msg_ctl);
371         while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0) {
372                 wakeup(&iocom->msg_ctl);
373                 tsleep(iocom, 0, "clstrkw", hz);
374         }
375
376         iocom->msgrd_td = NULL;
377
378         /*
379          * iocom can be ripped out from under us at this point but
380          * wakeup() is safe.
381          */
382         wakeup(iocom);
383         lwkt_exit();
384 }
385
/*
 * Transmit-side service thread for an iocom.
 *
 * arg is the kdmsg_iocom_t this thread services.  The thread runs in
 * three phases:
 *
 *  1. Transmit loop: dequeue messages from iocom->msgq, run them
 *     through kdmsg_state_msgtx() and write header and (aligned) aux
 *     data to iocom->msg_fp, until KILL is set or an error occurs.
 *  2. Shutdown handshake: shut the descriptor down, set KILLTX and
 *     wait for the reader to clear msgrd_td.
 *  3. Cleanup: mark every remaining state DYING, drain msgq and abort
 *     any state that has not yet seen a received DELETE, repeating
 *     until msgq and both state trees are empty.
 *
 * msglk is held except around the calls that are explicitly bracketed
 * by LK_RELEASE / LK_EXCLUSIVE below.
 */
386 static
387 void
388 kdmsg_iocom_thread_wr(void *arg)
389 {
390         kdmsg_iocom_t *iocom = arg;
391         kdmsg_msg_t *msg;
392         kdmsg_state_t *state;
393         ssize_t res;
394         size_t abytes;
395         int error = 0;
396         int retries = 20;       /* completed cleanup passes allowed before panic */
397
398         /*
399          * Transmit loop
400          */
401         msg = NULL;
402         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
403
404         while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILL) == 0 && error == 0) {
405                 /*
406                  * Sleep if no messages pending.  Interlock with flag while
407                  * holding msglk.
                 *
                 * The sleep also times out after hz ticks, so KILL is
                 * re-checked periodically even without a wakeup.
408                  */
409                 if (TAILQ_EMPTY(&iocom->msgq)) {
410                         atomic_set_int(&iocom->msg_ctl,
411                                        KDMSG_CLUSTERCTL_SLEEPING);
412                         lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
413                         atomic_clear_int(&iocom->msg_ctl,
414                                          KDMSG_CLUSTERCTL_SLEEPING);
415                 }
416
417                 while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
418                         /*
419                          * Remove msg from the transmit queue and do
420                          * persist and half-closed state handling.
                         *
                         * msglk is dropped for the state handling and
                         * the writes; every path below re-acquires it
                         * before continuing or leaving the inner loop.
421                          */
422                         TAILQ_REMOVE(&iocom->msgq, msg, qentry);
423                         lockmgr(&iocom->msglk, LK_RELEASE);
424
425                         error = kdmsg_state_msgtx(msg);
                        /* EALREADY: drop this message and carry on */
426                         if (error == EALREADY) {
427                                 error = 0;
428                                 kdmsg_msg_free(msg);
429                                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
430                                 continue;
431                         }
                        /* any other error ends the transmit loop */
432                         if (error) {
433                                 kdmsg_msg_free(msg);
434                                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
435                                 break;
436                         }
437
438                         /*
439                          * Dump the message to the pipe or socket.
440                          *
441                          * We have to clean up the message as if the transmit
442                          * succeeded even if it failed.
                         *
                         * A write that returns no error but a byte
                         * count other than the requested one is
                         * reported as EINVAL.
443                          */
444                         error = fp_write(iocom->msg_fp, &msg->any,
445                                          msg->hdr_size, &res, UIO_SYSSPACE);
446                         if (error || res != msg->hdr_size) {
447                                 if (error == 0)
448                                         error = EINVAL;
449                                 kdmsg_state_cleanuptx(msg);
450                                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
451                                 break;
452                         }
                        /* aux data is written at its aligned size */
453                         if (msg->aux_size) {
454                                 abytes = DMSG_DOALIGN(msg->aux_size);
455                                 error = fp_write(iocom->msg_fp,
456                                                  msg->aux_data, abytes,
457                                                  &res, UIO_SYSSPACE);
458                                 if (error || res != abytes) {
459                                         if (error == 0)
460                                                 error = EINVAL;
461                                         kdmsg_state_cleanuptx(msg);
462                                         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
463                                         break;
464                                 }
465                         }
466                         kdmsg_state_cleanuptx(msg);
467                         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
468                 }
469         }
470
471         /*
472          * Report why the transmit loop ended.  msglk is still held
473          * here; it is released further down, before the KILLTX handshake.
474          */
474         if (error)
475                 kprintf("kdmsg: write failed error %d\n", error);
476         kprintf("thread_wr: Terminating iocom\n");
477
478         /*
479          * Shutdown the socket.  This will cause the rx thread to get an
480          * EOF and ensure that both threads get to a termination state.
481          */
482         fp_shutdown(iocom->msg_fp, SHUT_RDWR);
483
484         /*
485          * Set KILLTX (which the rx side waits for), then wait for the RX
486          * side to completely finish before we clean out any remaining
487          * command states.
         *
         * msglk is not held across this wait; the loop polls msgrd_td
         * with a timeout of hz ticks.
488          */
489         lockmgr(&iocom->msglk, LK_RELEASE);
490         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLTX);
491         wakeup(&iocom->msg_ctl);
492         while (iocom->msgrd_td) {
493                 wakeup(&iocom->msg_ctl);
494                 tsleep(iocom, 0, "clstrkw", hz);
495         }
496         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
497
498         /*
499          * Simulate received MSGF_DELETE's for any remaining states.
500          * (For remote masters).
501          *
502          * Drain the message queue to handle any device initiated writes
503          * due to state callbacks.
         *
         * Each pass first marks every state in both trees DYING.  msglk
         * is dropped around kdmsg_state_abort(), and the whole scan is
         * restarted from the label after every abort (presumably because
         * the trees may change while the lock is not held).
504          */
505 cleanuprd:
506         RB_FOREACH(state, kdmsg_state_tree, &iocom->staterd_tree)
507                 atomic_set_int(&state->flags, KDMSG_STATE_DYING);
508         RB_FOREACH(state, kdmsg_state_tree, &iocom->statewr_tree)
509                 atomic_set_int(&state->flags, KDMSG_STATE_DYING);
510         kdmsg_drain_msgq(iocom);
511         RB_FOREACH(state, kdmsg_state_tree, &iocom->staterd_tree) {
512                 if ((state->rxcmd & DMSGF_DELETE) == 0) {
513                         lockmgr(&iocom->msglk, LK_RELEASE);
514                         kdmsg_state_abort(state);
515                         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
516                         goto cleanuprd;
517                 }
518         }
519
520         /*
521          * Simulate received MSGF_DELETE's for any remaining states.
522          * (For local masters).
523          */
524         kdmsg_drain_msgq(iocom);
525         RB_FOREACH(state, kdmsg_state_tree, &iocom->statewr_tree) {
526                 if ((state->rxcmd & DMSGF_DELETE) == 0) {
527                         lockmgr(&iocom->msglk, LK_RELEASE);
528                         kdmsg_state_abort(state);
529                         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
530                         goto cleanuprd;
531                 }
532         }
533
534         /*
535          * Retry until all work is done
         *
         * retries is only decremented here, i.e. once per pass that got
         * through both abort scans without restarting.
536          */
537         if (--retries == 0)
538                 panic("kdmsg: comm thread shutdown couldn't drain");
539         if (TAILQ_FIRST(&iocom->msgq) ||
540             RB_ROOT(&iocom->staterd_tree) ||
541             RB_ROOT(&iocom->statewr_tree)) {
542                 goto cleanuprd;
543         }
544         iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
545
546         lockmgr(&iocom->msglk, LK_RELEASE);
547
548         /*
549          * The state trees had better be empty now
550          */
551         KKASSERT(RB_EMPTY(&iocom->staterd_tree));
552         KKASSERT(RB_EMPTY(&iocom->statewr_tree));
553         KKASSERT(iocom->conn_state == NULL);
554
555         if (iocom->exit_func) {
556                 /*
557                  * iocom is invalid after we call the exit function.
558                  */
559                 iocom->msgwr_td = NULL;
560                 iocom->exit_func(iocom);
561         } else {
562                 /*
563                  * iocom can be ripped out from under us once msgwr_td is
564                  * set to NULL.  The wakeup is safe.
565                  */
566                 iocom->msgwr_td = NULL;
567                 wakeup(iocom);
568         }
569         lwkt_exit();
570 }
571
572 /*
573  * This cleans out the pending transmit message queue, adjusting any
574  * persistent states properly in the process.
575  *
576  * Caller must hold pmp->iocom.msglk
577  */
578 void
579 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
580 {
581         kdmsg_msg_t *msg;
582
583         /*
584          * Clean out our pending transmit queue, executing the
585          * appropriate state adjustments.  If this tries to open
586          * any new outgoing transactions we have to loop up and
587          * clean them out.
588          */
589         while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
590                 TAILQ_REMOVE(&iocom->msgq, msg, qentry);
591                 lockmgr(&iocom->msglk, LK_RELEASE);
592                 if (kdmsg_state_msgtx(msg))
593                         kdmsg_msg_free(msg);
594                 else
595                         kdmsg_state_cleanuptx(msg);
596                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
597         }
598 }
599
600 /*
601  * Do all processing required to handle a freshly received message
602  * after its low level header has been validated.
603  */
604 static
605 int
606 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
607 {
608         kdmsg_iocom_t *iocom = msg->state->iocom;
609         int error;
610
611         /*
612          * State machine tracking, state assignment for msg,
613          * returns error and discard status.  Errors are fatal
614          * to the connection except for EALREADY which forces
615          * a discard without execution.
616          */
617         error = kdmsg_state_msgrx(msg);
618         if (error) {
619                 /*
620                  * Raw protocol or connection error
621                  */
622                 kdmsg_msg_free(msg);
623                 if (error == EALREADY)
624                         error = 0;
625         } else if (msg->state && msg->state->func) {
626                 /*
627                  * Message related to state which already has a
628                  * handling function installed for it.
629                  */
630                 error = msg->state->func(msg->state, msg);
631                 kdmsg_state_cleanuprx(msg);
632         } else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
633                 error = kdmsg_autorxmsg(msg);
634                 kdmsg_state_cleanuprx(msg);
635         } else {
636                 error = iocom->rcvmsg(msg);
637                 kdmsg_state_cleanuprx(msg);
638         }
639         return error;
640 }
641
642 /*
643  * Process state tracking for a message after reception, prior to
644  * execution.
645  *
646  * Called with msglk held and the msg dequeued.
647  *
648  * All messages are called with dummy state and return actual state.
649  * (One-off messages often just return the same dummy state).
650  *
651  * May request that caller discard the message by setting *discardp to 1.
652  * The returned state is not used in this case and is allowed to be NULL.
653  *
654  * --
655  *
656  * These routines handle persistent and command/reply message state via the
657  * CREATE and DELETE flags.  The first message in a command or reply sequence
658  * sets CREATE, the last message in a command or reply sequence sets DELETE.
659  *
660  * There can be any number of intermediate messages belonging to the same
661  * sequence sent in between the CREATE message and the DELETE message,
662  * which set neither flag.  This represents a streaming command or reply.
663  *
664  * Any command message received with CREATE set expects a reply sequence to
665  * be returned.  Reply sequences work the same as command sequences except the
666  * REPLY bit is also set.  Both the command side and reply side can
667  * degenerate into a single message with both CREATE and DELETE set.  Note
668  * that one side can be streaming and the other side not, or neither, or both.
669  *
670  * The msgid is unique for the initiator.  That is, two sides sending a new
671  * message can use the same msgid without colliding.
672  *
673  * --
674  *
675  * ABORT sequences work by setting the ABORT flag along with normal message
676  * state.  However, ABORTs can also be sent on half-closed messages, that is
677  * even if the command or reply side has already sent a DELETE, as long as
678  * the message has not been fully closed it can still send an ABORT+DELETE
679  * to terminate the half-closed message state.
680  *
681  * Since ABORT+DELETEs can race we silently discard ABORT's for message
682  * state which has already been fully closed.  REPLY+ABORT+DELETEs can
683  * also race, and in this situation the other side might have already
684  * initiated a new unrelated command with the same message id.  Since
685  * the abort has not set the CREATE flag the situation can be detected
686  * and the message will also be discarded.
687  *
688  * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
689  * The ABORT request is essentially integrated into the command instead
690  * of being sent later on.  In this situation the command implementation
691  * detects that CREATE and ABORT are both set (vs ABORT alone) and can
692  * special-case non-blocking operation for the command.
693  *
694  * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
695  *        to be mid-stream aborts for command/reply sequences.  ABORTs on
696  *        one-way messages are not supported.
697  *
698  * NOTE!  If a command sequence does not support aborts the ABORT flag is
699  *        simply ignored.
700  *
701  * --
702  *
703  * One-off messages (no reply expected) are sent with neither CREATE nor DELETE
704  * set.  One-off messages cannot be aborted and typically aren't processed
705  * by these routines.  The REPLY bit can be used to distinguish whether a
706  * one-off message is a command or reply.  For example, one-off replies
707  * will typically just contain status updates.
708  */
709 static
710 int
711 kdmsg_state_msgrx(kdmsg_msg_t *msg)
712 {
713         kdmsg_iocom_t *iocom = msg->state->iocom;
714         kdmsg_state_t *state;
715         kdmsg_state_t *pstate;
716         kdmsg_state_t sdummy;
717         int error;
718
719         /*
720          * Make sure a state structure is ready to go in case we need a new
721          * one.  This is the only routine which uses freerd_state so no
722          * races are possible.
723          */
724         if ((state = iocom->freerd_state) == NULL) {
725                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
726                 state->flags = KDMSG_STATE_DYNAMIC;
727                 state->iocom = iocom;
728                 TAILQ_INIT(&state->subq);
729                 iocom->freerd_state = state;
730         }
731
732         /*
733          * Lock RB tree and locate existing persistent state, if any.
734          *
735          * If received msg is a command state is on staterd_tree.
736          * If received msg is a reply state is on statewr_tree.
737          */
738         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
739
740         sdummy.msgid = msg->any.head.msgid;
741         sdummy.iocom = iocom;
742         if (msg->any.head.cmd & DMSGF_REVTRANS) {
743                 state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree,
744                                 &sdummy);
745         } else {
746                 state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree,
747                                 &sdummy);
748         }
749         if (state == NULL)
750                 state = &iocom->state0;
751         msg->state = state;
752
753         /*
754          * Short-cut one-off or mid-stream messages.
755          */
756         if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
757                                   DMSGF_ABORT)) == 0) {
758                 error = 0;
759                 goto done;
760         }
761
762         /*
763          * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
764          * inside the case statements.
765          */
766         switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
767         case DMSGF_CREATE:
768         case DMSGF_CREATE | DMSGF_DELETE:
769                 /*
770                  * New persistant command received.
771                  */
772                 if (state != &iocom->state0) {
773                         kprintf("kdmsg_state_msgrx: duplicate transaction\n");
774                         error = EINVAL;
775                         break;
776                 }
777
778                 /*
779                  * Lookup the circuit.  The circuit is an open transaction.
780                  * the REVCIRC bit in the message tells us which side
781                  * initiated the transaction representing the circuit.
782                  */
783                 if (msg->any.head.circuit) {
784                         sdummy.msgid = msg->any.head.circuit;
785
786                         if (msg->any.head.cmd & DMSGF_REVCIRC) {
787                                 pstate = RB_FIND(kdmsg_state_tree,
788                                                  &iocom->statewr_tree,
789                                                  &sdummy);
790                         } else {
791                                 pstate = RB_FIND(kdmsg_state_tree,
792                                                  &iocom->staterd_tree,
793                                                  &sdummy);
794                         }
795                         if (pstate == NULL) {
796                                 kprintf("kdmsg_state_msgrx: "
797                                         "missing parent in stacked trans\n");
798                                 error = EINVAL;
799                                 break;
800                         }
801                 } else {
802                         pstate = &iocom->state0;
803                 }
804
805                 /*
806                  * Allocate new state
807                  */
808                 state = iocom->freerd_state;
809                 iocom->freerd_state = NULL;
810
811                 msg->state = state;
812                 state->parent = pstate;
813                 KKASSERT(state->iocom == iocom);
814                 state->flags |= KDMSG_STATE_INSERTED |
815                                 KDMSG_STATE_OPPOSITE;
816                 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
817                 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
818                 state->txcmd = DMSGF_REPLY;
819                 state->msgid = msg->any.head.msgid;
820                 RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
821                 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
822                 error = 0;
823                 break;
824         case DMSGF_DELETE:
825                 /*
826                  * Persistent state is expected but might not exist if an
827                  * ABORT+DELETE races the close.
828                  */
829                 if (state == &iocom->state0) {
830                         if (msg->any.head.cmd & DMSGF_ABORT) {
831                                 error = EALREADY;
832                         } else {
833                                 kprintf("kdmsg_state_msgrx: "
834                                         "no state for DELETE\n");
835                                 error = EINVAL;
836                         }
837                         break;
838                 }
839
840                 /*
841                  * Handle another ABORT+DELETE case if the msgid has already
842                  * been reused.
843                  */
844                 if ((state->rxcmd & DMSGF_CREATE) == 0) {
845                         if (msg->any.head.cmd & DMSGF_ABORT) {
846                                 error = EALREADY;
847                         } else {
848                                 kprintf("kdmsg_state_msgrx: "
849                                         "state reused for DELETE\n");
850                                 error = EINVAL;
851                         }
852                         break;
853                 }
854                 error = 0;
855                 break;
856         default:
857                 /*
858                  * Check for mid-stream ABORT command received, otherwise
859                  * allow.
860                  */
861                 if (msg->any.head.cmd & DMSGF_ABORT) {
862                         if (state == &iocom->state0 ||
863                             (state->rxcmd & DMSGF_CREATE) == 0) {
864                                 error = EALREADY;
865                                 break;
866                         }
867                 }
868                 error = 0;
869                 break;
870         case DMSGF_REPLY | DMSGF_CREATE:
871         case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
872                 /*
873                  * When receiving a reply with CREATE set the original
874                  * persistent state message should already exist.
875                  */
876                 if (state == &iocom->state0) {
877                         kprintf("kdmsg_state_msgrx: no state match for "
878                                 "REPLY cmd=%08x msgid=%016jx\n",
879                                 msg->any.head.cmd,
880                                 (intmax_t)msg->any.head.msgid);
881                         error = EINVAL;
882                         break;
883                 }
884                 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
885                 error = 0;
886                 break;
887         case DMSGF_REPLY | DMSGF_DELETE:
888                 /*
889                  * Received REPLY+ABORT+DELETE in case where msgid has
890                  * already been fully closed, ignore the message.
891                  */
892                 if (state == &iocom->state0) {
893                         if (msg->any.head.cmd & DMSGF_ABORT) {
894                                 error = EALREADY;
895                         } else {
896                                 kprintf("kdmsg_state_msgrx: no state match "
897                                         "for REPLY|DELETE\n");
898                                 error = EINVAL;
899                         }
900                         break;
901                 }
902
903                 /*
904                  * Received REPLY+ABORT+DELETE in case where msgid has
905                  * already been reused for an unrelated message,
906                  * ignore the message.
907                  */
908                 if ((state->rxcmd & DMSGF_CREATE) == 0) {
909                         if (msg->any.head.cmd & DMSGF_ABORT) {
910                                 error = EALREADY;
911                         } else {
912                                 kprintf("kdmsg_state_msgrx: state reused "
913                                         "for REPLY|DELETE\n");
914                                 error = EINVAL;
915                         }
916                         break;
917                 }
918                 error = 0;
919                 break;
920         case DMSGF_REPLY:
921                 /*
922                  * Check for mid-stream ABORT reply received to sent command.
923                  */
924                 if (msg->any.head.cmd & DMSGF_ABORT) {
925                         if (state == &iocom->state0 ||
926                             (state->rxcmd & DMSGF_CREATE) == 0) {
927                                 error = EALREADY;
928                                 break;
929                         }
930                 }
931                 error = 0;
932                 break;
933         }
934
935         /*
936          * Calculate the easy-switch() transactional command.  Represents
937          * the outer-transaction command for any transaction-create or
938          * transaction-delete, and the inner message command for any
939          * non-transaction or inside-transaction command.  tcmd will be
940          * set to 0 if the message state is illegal.
941          *
942          * The two can be told apart because outer-transaction commands
943          * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
944          */
945 done:
946         lockmgr(&iocom->msglk, LK_RELEASE);
947
948         if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
949                 if (state != &iocom->state0) {
950                         msg->tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
951                                     (msg->any.head.cmd & (DMSGF_CREATE |
952                                                           DMSGF_DELETE |
953                                                           DMSGF_REPLY));
954                 } else {
955                         msg->tcmd = 0;
956                 }
957         } else {
958                 msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
959         }
960         return (error);
961 }
962
963 /*
964  * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
965  * This routine must call iocom->rcvmsg() for anything not automatically
966  * handled.
967  */
968 static int
969 kdmsg_autorxmsg(kdmsg_msg_t *msg)
970 {
971         kdmsg_iocom_t *iocom = msg->state->iocom;
972         int error = 0;
973         uint32_t cmd;
974
975         /*
976          * Main switch processes transaction create/delete sequences only.
977          * Use icmd (DELETEs use DMSG_LNK_ERROR
978          *
979          * NOTE: If processing in-transaction messages you generally want
980          *       an inner switch on msg->any.head.cmd.
981          */
982         if (msg->state) {
983                 cmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
984                       (msg->any.head.cmd & (DMSGF_CREATE |
985                                             DMSGF_DELETE |
986                                             DMSGF_REPLY));
987         } else {
988                 cmd = 0;
989         }
990
991         switch(cmd) {
992         case DMSG_LNK_CONN | DMSGF_CREATE:
993         case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
994                 /*
995                  * Received LNK_CONN transaction.  Transmit response and
996                  * leave transaction open, which allows the other end to
997                  * start to the SPAN protocol.
998                  *
999                  * Handle shim after acknowledging the CONN.
1000                  */
1001                 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1002                         if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1003                                 kdmsg_msg_result(msg, 0);
1004                                 if (iocom->auto_callback)
1005                                         iocom->auto_callback(msg);
1006                         } else {
1007                                 error = iocom->rcvmsg(msg);
1008                         }
1009                         break;
1010                 }
1011                 /* fall through */
1012         case DMSG_LNK_CONN | DMSGF_DELETE:
1013                 /*
1014                  * This message is usually simulated after a link is lost
1015                  * to clean up the transaction.
1016                  */
1017                 if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1018                         if (iocom->auto_callback)
1019                                 iocom->auto_callback(msg);
1020                         kdmsg_msg_reply(msg, 0);
1021                 } else {
1022                         error = iocom->rcvmsg(msg);
1023                 }
1024                 break;
1025         case DMSG_LNK_SPAN | DMSGF_CREATE:
1026         case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1027                 /*
1028                  * Received LNK_SPAN transaction.  We do not have to respond
1029                  * (except on termination), but we must leave the transaction
1030                  * open.
1031                  *
1032                  * Handle shim after acknowledging the SPAN.
1033                  */
1034                 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1035                         if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1036                                 if (iocom->auto_callback)
1037                                         iocom->auto_callback(msg);
1038                                 break;
1039                         }
1040                         /* fall through */
1041                 } else {
1042                         error = iocom->rcvmsg(msg);
1043                         break;
1044                 }
1045                 /* fall through */
1046         case DMSG_LNK_SPAN | DMSGF_DELETE:
1047                 /*
1048                  * Process shims (auto_callback) before cleaning up the
1049                  * circuit structure and closing the transactions.  Device
1050                  * driver should ensure that the circuit is not used after
1051                  * the auto_callback() returns.
1052                  *
1053                  * Handle shim before closing the SPAN transaction.
1054                  */
1055                 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1056                         if (iocom->auto_callback)
1057                                 iocom->auto_callback(msg);
1058                         kdmsg_msg_reply(msg, 0);
1059                 } else {
1060                         error = iocom->rcvmsg(msg);
1061                 }
1062                 break;
1063         default:
1064                 /*
1065                  * Anything unhandled goes into rcvmsg.
1066                  *
1067                  * NOTE: Replies to link-level messages initiated by our side
1068                  *       are handled by the state callback, they are NOT
1069                  *       handled here.
1070                  */
1071                 error = iocom->rcvmsg(msg);
1072                 break;
1073         }
1074         return (error);
1075 }
1076
1077 /*
1078  * Post-receive-handling message and state cleanup.  This routine is called
1079  * after the state function handling/callback to properly dispose of the
1080  * message and update or dispose of the state.
1081  */
1082 static
1083 void
1084 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1085 {
1086         kdmsg_iocom_t *iocom = msg->state->iocom;
1087         kdmsg_state_t *state;
1088         kdmsg_state_t *pstate;
1089
1090         if ((state = msg->state) == NULL) {
1091                 kdmsg_msg_free(msg);
1092         } else if (msg->any.head.cmd & DMSGF_DELETE) {
1093                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1094                 KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1095                 state->rxcmd |= DMSGF_DELETE;
1096                 if (state->txcmd & DMSGF_DELETE) {
1097                         KKASSERT(state->flags & KDMSG_STATE_INSERTED);
1098                         if (state->rxcmd & DMSGF_REPLY) {
1099                                 KKASSERT(msg->any.head.cmd &
1100                                          DMSGF_REPLY);
1101                                 RB_REMOVE(kdmsg_state_tree,
1102                                           &iocom->statewr_tree, state);
1103                         } else {
1104                                 KKASSERT((msg->any.head.cmd &
1105                                           DMSGF_REPLY) == 0);
1106                                 RB_REMOVE(kdmsg_state_tree,
1107                                           &iocom->staterd_tree, state);
1108                         }
1109                         pstate = state->parent;
1110                         TAILQ_REMOVE(&pstate->subq, state, entry);
1111                         if (pstate != &pstate->iocom->state0 &&
1112                             TAILQ_EMPTY(&pstate->subq) &&
1113                             (pstate->flags & KDMSG_STATE_INSERTED) == 0) {
1114                                 kdmsg_state_free(pstate);
1115                         }
1116                         state->flags &= ~KDMSG_STATE_INSERTED;
1117                         state->parent = NULL;
1118                         kdmsg_msg_free(msg);
1119                         if (TAILQ_EMPTY(&state->subq))
1120                                 kdmsg_state_free(state);
1121                         lockmgr(&iocom->msglk, LK_RELEASE);
1122                 } else {
1123                         kdmsg_msg_free(msg);
1124                         lockmgr(&iocom->msglk, LK_RELEASE);
1125                 }
1126         } else {
1127                 kdmsg_msg_free(msg);
1128         }
1129 }
1130
1131 /*
1132  * Simulate receiving a message which terminates an active transaction
1133  * state.  Our simulated received message must set DELETE and may also
1134  * have to set CREATE.  It must also ensure that all fields are set such
1135  * that the receive handling code can find the state (kdmsg_state_msgrx())
1136  * or an endless loop will ensue.
1137  *
1138  * This is used when the other end of the link is dead so the device driver
1139  * gets a completed transaction for all pending states.
1140  */
1141 static
1142 void
1143 kdmsg_state_abort(kdmsg_state_t *state)
1144 {
1145         kdmsg_msg_t *msg;
1146
1147         /*
1148          * Prevent recursive aborts which could otherwise occur if the
1149          * simulated message reception runs state->func which then turns
1150          * around and tries to reply to a broken circuit when then calls
1151          * the state abort code again.
1152          */
1153         if (state->flags & KDMSG_STATE_ABORTING)
1154                 return;
1155         state->flags |= KDMSG_STATE_ABORTING;
1156
1157         /*
1158          * NOTE: Args to kdmsg_msg_alloc() to avoid dynamic state allocation.
1159          *
1160          * NOTE: We are simulating a received message using our state
1161          *       (vs a message generated by the other side using its state),
1162          *       so we must invert DMSGF_REVTRANS and DMSGF_REVCIRC.
1163          */
1164         msg = kdmsg_msg_alloc(state, DMSG_LNK_ERROR, NULL, NULL);
1165         if ((state->rxcmd & DMSGF_CREATE) == 0)
1166                 msg->any.head.cmd |= DMSGF_CREATE;
1167         msg->any.head.cmd |= DMSGF_DELETE | (state->rxcmd & DMSGF_REPLY);
1168         msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
1169         msg->any.head.error = DMSG_ERR_LOSTLINK;
1170         kdmsg_msg_receive_handling(msg);
1171 }
1172
1173 /*
1174  * Process state tracking for a message prior to transmission.
1175  *
1176  * Called with msglk held and the msg dequeued.  Returns non-zero if
1177  * the message is bad and should be deleted by the caller.
1178  *
1179  * One-off messages are usually with dummy state and msg->state may be NULL
1180  * in this situation.
1181  *
1182  * New transactions (when CREATE is set) will insert the state.
1183  *
1184  * May request that caller discard the message by setting *discardp to 1.
1185  * A NULL state may be returned in this case.
1186  */
1187 static
1188 int
1189 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1190 {
1191         kdmsg_iocom_t *iocom = msg->state->iocom;
1192         kdmsg_state_t *state;
1193         int error;
1194
1195         /*
1196          * Make sure a state structure is ready to go in case we need a new
1197          * one.  This is the only routine which uses freewr_state so no
1198          * races are possible.
1199          */
1200         if ((state = iocom->freewr_state) == NULL) {
1201                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1202                 state->flags = KDMSG_STATE_DYNAMIC;
1203                 state->iocom = iocom;
1204                 iocom->freewr_state = state;
1205         }
1206
1207         /*
1208          * Lock RB tree.  If persistent state is present it will have already
1209          * been assigned to msg.
1210          */
1211         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1212         state = msg->state;
1213
1214         /*
1215          * Short-cut one-off or mid-stream messages (state may be NULL).
1216          */
1217         if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1218                                   DMSGF_ABORT)) == 0) {
1219                 lockmgr(&iocom->msglk, LK_RELEASE);
1220                 return(0);
1221         }
1222
1223
1224         /*
1225          * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1226          * inside the case statements.
1227          */
1228         switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1229                                     DMSGF_REPLY)) {
1230         case DMSGF_CREATE:
1231         case DMSGF_CREATE | DMSGF_DELETE:
1232                 /*
1233                  * Insert the new persistent message state and mark
1234                  * half-closed if DELETE is set.  Since this is a new
1235                  * message it isn't possible to transition into the fully
1236                  * closed state here.
1237                  *
1238                  * XXX state must be assigned and inserted by
1239                  *     kdmsg_msg_write().  txcmd is assigned by us
1240                  *     on-transmit.
1241                  */
1242                 KKASSERT(state != NULL);
1243                 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1244                 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1245                 state->rxcmd = DMSGF_REPLY;
1246                 error = 0;
1247                 break;
1248         case DMSGF_DELETE:
1249                 /*
1250                  * Sent ABORT+DELETE in case where msgid has already
1251                  * been fully closed, ignore the message.
1252                  */
1253                 if (state == &iocom->state0) {
1254                         if (msg->any.head.cmd & DMSGF_ABORT) {
1255                                 error = EALREADY;
1256                         } else {
1257                                 kprintf("kdmsg_state_msgtx: no state match "
1258                                         "for DELETE cmd=%08x msgid=%016jx\n",
1259                                         msg->any.head.cmd,
1260                                         (intmax_t)msg->any.head.msgid);
1261                                 error = EINVAL;
1262                         }
1263                         break;
1264                 }
1265
1266                 /*
1267                  * Sent ABORT+DELETE in case where msgid has
1268                  * already been reused for an unrelated message,
1269                  * ignore the message.
1270                  */
1271                 if ((state->txcmd & DMSGF_CREATE) == 0) {
1272                         if (msg->any.head.cmd & DMSGF_ABORT) {
1273                                 error = EALREADY;
1274                         } else {
1275                                 kprintf("kdmsg_state_msgtx: state reused "
1276                                         "for DELETE\n");
1277                                 error = EINVAL;
1278                         }
1279                         break;
1280                 }
1281                 error = 0;
1282                 break;
1283         default:
1284                 /*
1285                  * Check for mid-stream ABORT command sent
1286                  */
1287                 if (msg->any.head.cmd & DMSGF_ABORT) {
1288                         if (state == &state->iocom->state0 ||
1289                             (state->txcmd & DMSGF_CREATE) == 0) {
1290                                 error = EALREADY;
1291                                 break;
1292                         }
1293                 }
1294                 error = 0;
1295                 break;
1296         case DMSGF_REPLY | DMSGF_CREATE:
1297         case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1298                 /*
1299                  * When transmitting a reply with CREATE set the original
1300                  * persistent state message should already exist.
1301                  */
1302                 if (state == &state->iocom->state0) {
1303                         kprintf("kdmsg_state_msgtx: no state match "
1304                                 "for REPLY | CREATE\n");
1305                         error = EINVAL;
1306                         break;
1307                 }
1308                 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1309                 error = 0;
1310                 break;
1311         case DMSGF_REPLY | DMSGF_DELETE:
1312                 /*
1313                  * When transmitting a reply with DELETE set the original
1314                  * persistent state message should already exist.
1315                  *
1316                  * This is very similar to the REPLY|CREATE|* case except
1317                  * txcmd is already stored, so we just add the DELETE flag.
1318                  *
1319                  * Sent REPLY+ABORT+DELETE in case where msgid has
1320                  * already been fully closed, ignore the message.
1321                  */
1322                 if (state == &state->iocom->state0) {
1323                         if (msg->any.head.cmd & DMSGF_ABORT) {
1324                                 error = EALREADY;
1325                         } else {
1326                                 kprintf("kdmsg_state_msgtx: no state match "
1327                                         "for REPLY | DELETE\n");
1328                                 error = EINVAL;
1329                         }
1330                         break;
1331                 }
1332
1333                 /*
1334                  * Sent REPLY+ABORT+DELETE in case where msgid has already
1335                  * been reused for an unrelated message, ignore the message.
1336                  */
1337                 if ((state->txcmd & DMSGF_CREATE) == 0) {
1338                         if (msg->any.head.cmd & DMSGF_ABORT) {
1339                                 error = EALREADY;
1340                         } else {
1341                                 kprintf("kdmsg_state_msgtx: state reused "
1342                                         "for REPLY | DELETE\n");
1343                                 error = EINVAL;
1344                         }
1345                         break;
1346                 }
1347                 error = 0;
1348                 break;
1349         case DMSGF_REPLY:
1350                 /*
1351                  * Check for mid-stream ABORT reply sent.
1352                  *
1353                  * One-off REPLY messages are allowed for e.g. status updates.
1354                  */
1355                 if (msg->any.head.cmd & DMSGF_ABORT) {
1356                         if (state == &state->iocom->state0 ||
1357                             (state->txcmd & DMSGF_CREATE) == 0) {
1358                                 error = EALREADY;
1359                                 break;
1360                         }
1361                 }
1362                 error = 0;
1363                 break;
1364         }
1365         lockmgr(&iocom->msglk, LK_RELEASE);
1366         return (error);
1367 }
1368
1369 static
1370 void
1371 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1372 {
1373         kdmsg_iocom_t *iocom = msg->state->iocom;
1374         kdmsg_state_t *state;
1375         kdmsg_state_t *pstate;
1376
1377         if ((state = msg->state) == NULL) {
1378                 kdmsg_msg_free(msg);
1379         } else if (msg->any.head.cmd & DMSGF_DELETE) {
1380                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1381                 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1382                 state->txcmd |= DMSGF_DELETE;
1383                 if (state->rxcmd & DMSGF_DELETE) {
1384                         KKASSERT(state->flags & KDMSG_STATE_INSERTED);
1385                         if (state->txcmd & DMSGF_REPLY) {
1386                                 KKASSERT(msg->any.head.cmd &
1387                                          DMSGF_REPLY);
1388                                 RB_REMOVE(kdmsg_state_tree,
1389                                           &iocom->staterd_tree, state);
1390                         } else {
1391                                 KKASSERT((msg->any.head.cmd &
1392                                           DMSGF_REPLY) == 0);
1393                                 RB_REMOVE(kdmsg_state_tree,
1394                                           &iocom->statewr_tree, state);
1395                         }
1396                         pstate = state->parent;
1397                         TAILQ_REMOVE(&pstate->subq, state, entry);
1398                         if (pstate != &pstate->iocom->state0 &&
1399                             TAILQ_EMPTY(&pstate->subq) &&
1400                             (pstate->flags & KDMSG_STATE_INSERTED) == 0) {
1401                                 kdmsg_state_free(pstate);
1402                         }
1403                         state->flags &= ~KDMSG_STATE_INSERTED;
1404                         state->parent = NULL;
1405                         kdmsg_msg_free(msg);
1406                         if (TAILQ_EMPTY(&state->subq))
1407                                 kdmsg_state_free(state);
1408                         lockmgr(&iocom->msglk, LK_RELEASE);
1409                 } else {
1410                         kdmsg_msg_free(msg);
1411                         lockmgr(&iocom->msglk, LK_RELEASE);
1412                 }
1413         } else {
1414                 kdmsg_msg_free(msg);
1415         }
1416 }
1417
1418 static
1419 void
1420 kdmsg_state_free(kdmsg_state_t *state)
1421 {
1422         kdmsg_iocom_t *iocom = state->iocom;
1423
1424         KKASSERT((state->flags & KDMSG_STATE_INSERTED) == 0);
1425         kfree(state, iocom->mmsg);
1426 }
1427
1428 kdmsg_msg_t *
1429 kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
1430                 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1431 {
1432         kdmsg_iocom_t *iocom = state->iocom;
1433         kdmsg_state_t *pstate;
1434         kdmsg_msg_t *msg;
1435         size_t hbytes;
1436
1437         KKASSERT(iocom != NULL);
1438         hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1439         msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1440                       iocom->mmsg, M_WAITOK | M_ZERO);
1441         msg->hdr_size = hbytes;
1442
1443         if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
1444                 /*
1445                  * New transaction, requires tracking state and a unique
1446                  * msgid to be allocated.
1447                  */
1448                 pstate = state;
1449                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1450                 TAILQ_INIT(&state->subq);
1451                 state->iocom = iocom;
1452                 state->parent = pstate;
1453                 state->flags = KDMSG_STATE_DYNAMIC;
1454                 state->func = func;
1455                 state->any.any = data;
1456                 state->msgid = (uint64_t)(uintptr_t)state;
1457                 /*msg->any.head.msgid = state->msgid;XXX*/
1458
1459                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1460                 if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1461                         panic("duplicate msgid allocated");
1462                 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
1463                 state->flags |= KDMSG_STATE_INSERTED;
1464                 lockmgr(&iocom->msglk, LK_RELEASE);
1465         } else {
1466                 pstate = state->parent;
1467         }
1468
1469         if (state->flags & KDMSG_STATE_OPPOSITE)
1470                 cmd |= DMSGF_REVTRANS;
1471         if (pstate->flags & KDMSG_STATE_OPPOSITE)
1472                 cmd |= DMSGF_REVCIRC;
1473
1474         msg->any.head.magic = DMSG_HDR_MAGIC;
1475         msg->any.head.cmd = cmd;
1476         msg->any.head.msgid = state->msgid;
1477         msg->any.head.circuit = pstate->msgid;
1478         msg->state = state;
1479
1480         return (msg);
1481 }
1482
1483 void
1484 kdmsg_msg_free(kdmsg_msg_t *msg)
1485 {
1486         kdmsg_iocom_t *iocom = msg->state->iocom;
1487
1488         if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1489             msg->aux_data && msg->aux_size) {
1490                 kfree(msg->aux_data, iocom->mmsg);
1491                 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1492         }
1493         msg->state = NULL;
1494         msg->aux_data = NULL;
1495         msg->aux_size = 0;
1496
1497         kfree(msg, iocom->mmsg);
1498 }
1499
1500 /*
1501  * Indexed messages are stored in a red-black tree indexed by their
1502  * msgid.  Only persistent messages are indexed.
1503  */
1504 int
1505 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1506 {
1507         if (state1->iocom < state2->iocom)
1508                 return(-1);
1509         if (state1->iocom > state2->iocom)
1510                 return(1);
1511         if (state1->msgid < state2->msgid)
1512                 return(-1);
1513         if (state1->msgid > state2->msgid)
1514                 return(1);
1515         return(0);
1516 }
1517
1518 /*
1519  * Write a message.  All requisit command flags have been set.
1520  *
1521  * If msg->state is non-NULL the message is written to the existing
1522  * transaction.  msgid will be set accordingly.
1523  *
1524  * If msg->state is NULL and CREATE is set new state is allocated and
1525  * (func, data) is installed.  A msgid is assigned.
1526  *
1527  * If msg->state is NULL and CREATE is not set the message is assumed
1528  * to be a one-way message.  The originator must assign the msgid
1529  * (or leave it 0, which is typical.
1530  *
1531  * This function merely queues the message to the management thread, it
1532  * does not write to the message socket/pipe.
1533  */
1534 void
1535 kdmsg_msg_write(kdmsg_msg_t *msg)
1536 {
1537         kdmsg_iocom_t *iocom = msg->state->iocom;
1538         kdmsg_state_t *state;
1539
1540         if (msg->state) {
1541                 /*
1542                  * Continuance or termination of existing transaction.
1543                  * The transaction could have been initiated by either end.
1544                  *
1545                  * (Function callback and aux data for the receive side can
1546                  * be replaced or left alone).
1547                  */
1548                 state = msg->state;
1549                 msg->any.head.msgid = state->msgid;
1550                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1551         } else {
1552                 /*
1553                  * One-off message (always uses msgid 0 to distinguish
1554                  * between a possibly lost in-transaction message due to
1555                  * competing aborts and a real one-off message?)
1556                  */
1557                 state = NULL;
1558                 msg->any.head.msgid = 0;
1559                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1560         }
1561
1562         /*
1563          * This flag is not set until after the tx thread has drained
1564          * the txmsgq and simulated responses.  After that point the
1565          * txthread is dead and can no longer simulate responses.
1566          *
1567          * Device drivers should never try to send a message once this
1568          * flag is set.  They should have detected (through the state
1569          * closures) that the link is in trouble.
1570          */
1571         if (iocom->flags & KDMSG_IOCOMF_EXITNOACC) {
1572                 lockmgr(&iocom->msglk, LK_RELEASE);
1573                 panic("kdmsg_msg_write: Attempt to write message to "
1574                       "terminated iocom\n");
1575         }
1576
1577         /*
1578          * Finish up the msg fields.  Note that msg->aux_size and the
1579          * aux_bytes stored in the message header represent the unaligned
1580          * (actual) bytes of data, but the buffer is sized to an aligned
1581          * size and the CRC is generated over the aligned length.
1582          */
1583         msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
1584         ++iocom->msg_seq;
1585
1586         if (msg->aux_data && msg->aux_size) {
1587                 uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
1588
1589                 msg->any.head.aux_bytes = msg->aux_size;
1590                 msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
1591         }
1592         msg->any.head.hdr_crc = 0;
1593         msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
1594
1595         TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
1596
1597         if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
1598                 atomic_clear_int(&iocom->msg_ctl,
1599                                  KDMSG_CLUSTERCTL_SLEEPING);
1600                 wakeup(&iocom->msg_ctl);
1601         }
1602
1603         lockmgr(&iocom->msglk, LK_RELEASE);
1604 }
1605
1606 /*
1607  * Reply to a message and terminate our side of the transaction.
1608  *
1609  * If msg->state is non-NULL we are replying to a one-way message.
1610  */
1611 void
1612 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
1613 {
1614         kdmsg_state_t *state = msg->state;
1615         kdmsg_msg_t *nmsg;
1616         uint32_t cmd;
1617
1618         /*
1619          * Reply with a simple error code and terminate the transaction.
1620          */
1621         cmd = DMSG_LNK_ERROR;
1622
1623         /*
1624          * Check if our direction has even been initiated yet, set CREATE.
1625          *
1626          * Check what direction this is (command or reply direction).  Note
1627          * that txcmd might not have been initiated yet.
1628          *
1629          * If our direction has already been closed we just return without
1630          * doing anything.
1631          */
1632         if (state != &state->iocom->state0) {
1633                 if (state->txcmd & DMSGF_DELETE)
1634                         return;
1635                 if ((state->txcmd & DMSGF_CREATE) == 0)
1636                         cmd |= DMSGF_CREATE;
1637                 if (state->txcmd & DMSGF_REPLY)
1638                         cmd |= DMSGF_REPLY;
1639                 cmd |= DMSGF_DELETE;
1640         } else {
1641                 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
1642                         cmd |= DMSGF_REPLY;
1643         }
1644
1645         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
1646         nmsg->any.head.error = error;
1647         kdmsg_msg_write(nmsg);
1648 }
1649
1650 /*
1651  * Reply to a message and continue our side of the transaction.
1652  *
1653  * If msg->state is non-NULL we are replying to a one-way message and this
1654  * function degenerates into the same as kdmsg_msg_reply().
1655  */
1656 void
1657 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
1658 {
1659         kdmsg_state_t *state = msg->state;
1660         kdmsg_msg_t *nmsg;
1661         uint32_t cmd;
1662
1663         /*
1664          * Return a simple result code, do NOT terminate the transaction.
1665          */
1666         cmd = DMSG_LNK_ERROR;
1667
1668         /*
1669          * Check if our direction has even been initiated yet, set CREATE.
1670          *
1671          * Check what direction this is (command or reply direction).  Note
1672          * that txcmd might not have been initiated yet.
1673          *
1674          * If our direction has already been closed we just return without
1675          * doing anything.
1676          */
1677         if (state != &state->iocom->state0) {
1678                 if (state->txcmd & DMSGF_DELETE)
1679                         return;
1680                 if ((state->txcmd & DMSGF_CREATE) == 0)
1681                         cmd |= DMSGF_CREATE;
1682                 if (state->txcmd & DMSGF_REPLY)
1683                         cmd |= DMSGF_REPLY;
1684                 /* continuing transaction, do not set MSGF_DELETE */
1685         } else {
1686                 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
1687                         cmd |= DMSGF_REPLY;
1688         }
1689
1690         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
1691         nmsg->any.head.error = error;
1692         kdmsg_msg_write(nmsg);
1693 }
1694
1695 /*
1696  * Reply to a message and terminate our side of the transaction.
1697  *
1698  * If msg->state is non-NULL we are replying to a one-way message.
1699  */
1700 void
1701 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
1702 {
1703         kdmsg_msg_t *nmsg;
1704         uint32_t cmd;
1705
1706         /*
1707          * Reply with a simple error code and terminate the transaction.
1708          */
1709         cmd = DMSG_LNK_ERROR;
1710
1711         /*
1712          * Check if our direction has even been initiated yet, set CREATE.
1713          *
1714          * Check what direction this is (command or reply direction).  Note
1715          * that txcmd might not have been initiated yet.
1716          *
1717          * If our direction has already been closed we just return without
1718          * doing anything.
1719          */
1720         KKASSERT(state);
1721         if (state->txcmd & DMSGF_DELETE)
1722                 return;
1723         if ((state->txcmd & DMSGF_CREATE) == 0)
1724                 cmd |= DMSGF_CREATE;
1725         if (state->txcmd & DMSGF_REPLY)
1726                 cmd |= DMSGF_REPLY;
1727         cmd |= DMSGF_DELETE;
1728
1729         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
1730         nmsg->any.head.error = error;
1731         kdmsg_msg_write(nmsg);
1732 }
1733
1734 /*
1735  * Reply to a message and continue our side of the transaction.
1736  *
1737  * If msg->state is non-NULL we are replying to a one-way message and this
1738  * function degenerates into the same as kdmsg_msg_reply().
1739  */
1740 void
1741 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
1742 {
1743         kdmsg_msg_t *nmsg;
1744         uint32_t cmd;
1745
1746         /*
1747          * Return a simple result code, do NOT terminate the transaction.
1748          */
1749         cmd = DMSG_LNK_ERROR;
1750
1751         /*
1752          * Check if our direction has even been initiated yet, set CREATE.
1753          *
1754          * Check what direction this is (command or reply direction).  Note
1755          * that txcmd might not have been initiated yet.
1756          *
1757          * If our direction has already been closed we just return without
1758          * doing anything.
1759          */
1760         KKASSERT(state);
1761         if (state->txcmd & DMSGF_DELETE)
1762                 return;
1763         if ((state->txcmd & DMSGF_CREATE) == 0)
1764                 cmd |= DMSGF_CREATE;
1765         if (state->txcmd & DMSGF_REPLY)
1766                 cmd |= DMSGF_REPLY;
1767         /* continuing transaction, do not set MSGF_DELETE */
1768
1769         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
1770         nmsg->any.head.error = error;
1771         kdmsg_msg_write(nmsg);
1772 }