usched: Add USCHED_SET_CPUMASK.
[dragonfly.git] / sys / kern / kern_dmsg.c
1 /*-
2  * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * TODO: txcmd CREATE state is deferred by tx msgq, need to calculate
36  *       a streaming response.  See subr_diskiocom()'s diskiodone().
37  */
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/sysctl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/priv.h>
53 #include <sys/thread.h>
54 #include <sys/globaldata.h>
55 #include <sys/limits.h>
56
57 #include <sys/dmsg.h>
58
59 RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
60
61 SYSCTL_NODE(, OID_AUTO, kdmsg, CTLFLAG_RW, 0, "kdmsg");
62 static int kdmsg_debug = 1;
63 SYSCTL_INT(_kdmsg, OID_AUTO, debug, CTLFLAG_RW, &kdmsg_debug, 0,
64            "Set debug level for kernel dmsg layer");
65
/*
 * Debug print helpers.  The message is emitted through kprintf() with a
 * "kdmsg: " prefix when kdmsg_debug >= level.  At least one argument must
 * follow the format string.  kdio_printf() additionally accepts the iocom
 * the message relates to; the argument is currently unused.
 *
 * The bodies are wrapped in do { } while (0) so that each macro expands
 * to exactly one statement.  A bare "if (...) kprintf(...)" expansion
 * would capture an "else" that follows the macro call (dangling else)
 * and silently change the caller's control flow.
 */
#define kd_printf(level, ctl, ...)				\
	do {							\
		if (kdmsg_debug >= (level))			\
			kprintf("kdmsg: " ctl, __VA_ARGS__);	\
	} while (0)

#define kdio_printf(iocom, level, ctl, ...)			\
	do {							\
		if (kdmsg_debug >= (level))			\
			kprintf("kdmsg: " ctl, __VA_ARGS__);	\
	} while (0)
71
72 static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
73 static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
74 static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
75 static void kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg);
76 static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
77 static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
78 static void kdmsg_subq_delete(kdmsg_state_t *state);
79 static void kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error);
80 static void kdmsg_state_abort(kdmsg_state_t *state);
81 static void kdmsg_state_dying(kdmsg_state_t *state);
82 static void kdmsg_state_free(kdmsg_state_t *state);
83
84 #ifdef KDMSG_DEBUG
85 #define KDMSG_DEBUG_ARGS        , const char *file, int line
86 #define kdmsg_state_hold(state) _kdmsg_state_hold(state, __FILE__, __LINE__)
87 #define kdmsg_state_drop(state) _kdmsg_state_drop(state, __FILE__, __LINE__)
88 #else
89 #define KDMSG_DEBUG 0
90 #define KDMSG_DEBUG_ARGS
91 #define kdmsg_state_hold(state) _kdmsg_state_hold(state)
92 #define kdmsg_state_drop(state) _kdmsg_state_drop(state)
93 #endif
94 static void _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
95 static void _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
96
97 static void kdmsg_iocom_thread_rd(void *arg);
98 static void kdmsg_iocom_thread_wr(void *arg);
99 static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
100
101 /*static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);*/
102
103 /*
104  * Initialize the roll-up communications structure for a network
105  * messaging session.  This function does not install the socket.
106  */
107 void
108 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
109                  struct malloc_type *mmsg,
110                  int (*rcvmsg)(kdmsg_msg_t *msg))
111 {
112         bzero(iocom, sizeof(*iocom));
113         iocom->handle = handle;
114         iocom->mmsg = mmsg;
115         iocom->rcvmsg = rcvmsg;
116         iocom->flags = flags;
117         lockinit(&iocom->msglk, "h2msg", 0, 0);
118         TAILQ_INIT(&iocom->msgq);
119         RB_INIT(&iocom->staterd_tree);
120         RB_INIT(&iocom->statewr_tree);
121
122         iocom->state0.iocom = iocom;
123         iocom->state0.parent = &iocom->state0;
124         TAILQ_INIT(&iocom->state0.subq);
125 }
126
127 /*
128  * [Re]connect using the passed file pointer.  The caller must ref the
129  * fp for us.  We own that ref now.
130  */
131 void
132 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
133                       const char *subsysname)
134 {
135         /*
136          * Destroy the current connection
137          */
138         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
139         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
140         while (iocom->msgrd_td || iocom->msgwr_td) {
141                 wakeup(&iocom->msg_ctl);
142                 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
143         }
144
145         /*
146          * Drop communications descriptor
147          */
148         if (iocom->msg_fp) {
149                 fdrop(iocom->msg_fp);
150                 iocom->msg_fp = NULL;
151         }
152
153         /*
154          * Setup new communications descriptor
155          */
156         iocom->msg_ctl = 0;
157         iocom->msg_fp = fp;
158         iocom->msg_seq = 0;
159         iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
160
161         lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
162                     NULL, 0, -1, "%s-msgrd", subsysname);
163         lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
164                     NULL, 0, -1, "%s-msgwr", subsysname);
165         lockmgr(&iocom->msglk, LK_RELEASE);
166 }
167
168 /*
169  * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
170  * this function to handle the state machine for LNK_CONN and LNK_SPAN.
171  */
172 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
173 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
174
175 void
176 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
177                          void (*auto_callback)(kdmsg_msg_t *msg))
178 {
179         kdmsg_msg_t *msg;
180
181         iocom->auto_callback = auto_callback;
182
183         msg = kdmsg_msg_alloc(&iocom->state0,
184                               DMSG_LNK_CONN | DMSGF_CREATE,
185                               kdmsg_lnk_conn_reply, NULL);
186         iocom->auto_lnk_conn.head = msg->any.head;
187         msg->any.lnk_conn = iocom->auto_lnk_conn;
188         iocom->conn_state = msg->state;
189         kdmsg_state_hold(msg->state);   /* iocom->conn_state */
190         kdmsg_msg_write(msg);
191 }
192
193 static
194 int
195 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
196 {
197         kdmsg_iocom_t *iocom = state->iocom;
198         kdmsg_msg_t *rmsg;
199
200         /*
201          * Upon receipt of the LNK_CONN acknowledgement initiate an
202          * automatic SPAN if we were asked to.  Used by e.g. xdisk, but
203          * not used by HAMMER2 which must manage more than one transmitted
204          * SPAN.
205          */
206         if ((msg->any.head.cmd & DMSGF_CREATE) &&
207             (iocom->flags & KDMSG_IOCOMF_AUTOTXSPAN)) {
208                 rmsg = kdmsg_msg_alloc(&iocom->state0,
209                                        DMSG_LNK_SPAN | DMSGF_CREATE,
210                                        kdmsg_lnk_span_reply, NULL);
211                 iocom->auto_lnk_span.head = rmsg->any.head;
212                 rmsg->any.lnk_span = iocom->auto_lnk_span;
213                 kdmsg_msg_write(rmsg);
214         }
215
216         /*
217          * Process shim after the CONN is acknowledged and before the CONN
218          * transaction is deleted.  For deletions this gives device drivers
219          * the ability to interlock new operations on the circuit before
220          * it becomes illegal and panics.
221          */
222         if (iocom->auto_callback)
223                 iocom->auto_callback(msg);
224
225         if ((state->txcmd & DMSGF_DELETE) == 0 &&
226             (msg->any.head.cmd & DMSGF_DELETE)) {
227                 /*
228                  * iocom->conn_state has a state ref, drop it when clearing.
229                  */
230                 if (iocom->conn_state)
231                         kdmsg_state_drop(iocom->conn_state);
232                 iocom->conn_state = NULL;
233                 kdmsg_msg_reply(msg, 0);
234         }
235
236         return (0);
237 }
238
239 static
240 int
241 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
242 {
243         /*
244          * Be sure to process shim before terminating the SPAN
245          * transaction.  Gives device drivers the ability to
246          * interlock new operations on the circuit before it
247          * becomes illegal and panics.
248          */
249         if (state->iocom->auto_callback)
250                 state->iocom->auto_callback(msg);
251
252         if ((state->txcmd & DMSGF_DELETE) == 0 &&
253             (msg->any.head.cmd & DMSGF_DELETE)) {
254                 kdmsg_msg_reply(msg, 0);
255         }
256         return (0);
257 }
258
259 /*
260  * Disconnect and clean up
261  */
262 void
263 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
264 {
265         kdmsg_state_t *state;
266         kdmsg_msg_t *msg;
267         int retries;
268
269         /*
270          * Ask the cluster controller to go away by setting
271          * KILLRX.  Send a PING to get a response to unstick reading
272          * from the pipe.
273          *
274          * After 10 seconds shitcan the pipe and do an unclean shutdown.
275          */
276         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
277
278         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
279         msg = kdmsg_msg_alloc(&iocom->state0, DMSG_LNK_PING, NULL, NULL);
280         kdmsg_msg_write_locked(iocom, msg);
281
282         retries = 10;
283         while (iocom->msgrd_td || iocom->msgwr_td) {
284                 wakeup(&iocom->msg_ctl);
285                 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
286                 if (--retries == 0 && iocom->msg_fp) {
287                         kdio_printf(iocom, 0, "%s\n",
288                                     "iocom_uninit: "
289                                     "shitcanning unresponsive pipe");
290                         fp_shutdown(iocom->msg_fp, SHUT_RDWR);
291                         /* retries allowed to go negative, keep looping */
292                 }
293         }
294
295         /*
296          * Cleanup caches
297          */
298         if ((state = iocom->freerd_state) != NULL) {
299                 iocom->freerd_state = NULL;
300                 kdmsg_state_drop(state);
301         }
302
303         if ((state = iocom->freewr_state) != NULL) {
304                 iocom->freewr_state = NULL;
305                 kdmsg_state_drop(state);
306         }
307
308         /*
309          * Drop communications descriptor
310          */
311         if (iocom->msg_fp) {
312                 fdrop(iocom->msg_fp);
313                 iocom->msg_fp = NULL;
314         }
315         lockmgr(&iocom->msglk, LK_RELEASE);
316 }
317
318 /*
319  * Cluster controller thread.  Perform messaging functions.  We have one
320  * thread for the reader and one for the writer.  The writer handles
321  * shutdown requests (which should break the reader thread).
322  */
323 static
324 void
325 kdmsg_iocom_thread_rd(void *arg)
326 {
327         kdmsg_iocom_t *iocom = arg;
328         dmsg_hdr_t hdr;
329         kdmsg_msg_t *msg = NULL;
330         size_t hbytes;
331         size_t abytes;
332         int error = 0;
333
334         while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLRX) == 0) {
335                 /*
336                  * Retrieve the message from the pipe or socket.
337                  */
338                 error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
339                                 NULL, 1, UIO_SYSSPACE);
340                 if (error)
341                         break;
342                 if (hdr.magic != DMSG_HDR_MAGIC) {
343                         kdio_printf(iocom, 1, "bad magic: %04x\n", hdr.magic);
344                         error = EINVAL;
345                         break;
346                 }
347                 hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
348                 if (hbytes < sizeof(hdr) || hbytes > DMSG_HDR_MAX) {
349                         kdio_printf(iocom, 1, "bad header size %zd\n", hbytes);
350                         error = EINVAL;
351                         break;
352                 }
353
354                 /* XXX messy: mask cmd to avoid allocating state */
355                 msg = kdmsg_msg_alloc(&iocom->state0,
356                                       hdr.cmd & DMSGF_BASECMDMASK,
357                                       NULL, NULL);
358                 msg->any.head = hdr;
359                 msg->hdr_size = hbytes;
360                 if (hbytes > sizeof(hdr)) {
361                         error = fp_read(iocom->msg_fp, &msg->any.head + 1,
362                                         hbytes - sizeof(hdr),
363                                         NULL, 1, UIO_SYSSPACE);
364                         if (error) {
365                                 kdio_printf(iocom, 1, "%s\n",
366                                             "short msg received");
367                                 error = EINVAL;
368                                 break;
369                         }
370                 }
371                 msg->aux_size = hdr.aux_bytes;
372                 if (msg->aux_size > DMSG_AUX_MAX) {
373                         kdio_printf(iocom, 1,
374                                     "illegal msg payload size %zd\n",
375                                     msg->aux_size);
376                         error = EINVAL;
377                         break;
378                 }
379                 if (msg->aux_size) {
380                         abytes = DMSG_DOALIGN(msg->aux_size);
381                         msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
382                         msg->flags |= KDMSG_FLAG_AUXALLOC;
383                         error = fp_read(iocom->msg_fp, msg->aux_data,
384                                         abytes, NULL, 1, UIO_SYSSPACE);
385                         if (error) {
386                                 kdio_printf(iocom, 1, "%s\n",
387                                             "short msg payload received");
388                                 break;
389                         }
390                 }
391
392                 error = kdmsg_msg_receive_handling(msg);
393                 msg = NULL;
394         }
395
396         kdio_printf(iocom, 1, "read thread terminating error=%d\n", error);
397
398         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
399         if (msg)
400                 kdmsg_msg_free(msg);
401
402         /*
403          * Shutdown the socket and set KILLRX for consistency in case the
404          * shutdown was not commanded.  Signal the transmit side to shutdown
405          * by setting KILLTX and waking it up.
406          */
407         fp_shutdown(iocom->msg_fp, SHUT_RDWR);
408         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
409                                         KDMSG_CLUSTERCTL_KILLTX);
410         iocom->msgrd_td = NULL;
411         lockmgr(&iocom->msglk, LK_RELEASE);
412         wakeup(&iocom->msg_ctl);
413
414         /*
415          * iocom can be ripped out at any time once the lock is
416          * released with msgrd_td set to NULL.  The wakeup()s are safe but
417          * that is all.
418          */
419         wakeup(iocom);
420         lwkt_exit();
421 }
422
423 static
424 void
425 kdmsg_iocom_thread_wr(void *arg)
426 {
427         kdmsg_iocom_t *iocom = arg;
428         kdmsg_msg_t *msg;
429         ssize_t res;
430         size_t abytes;
431         int error = 0;
432         int save_ticks;
433         int didwarn;
434
435         /*
436          * Transmit loop
437          */
438         msg = NULL;
439         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
440
441         while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0 && error == 0) {
442                 /*
443                  * Sleep if no messages pending.  Interlock with flag while
444                  * holding msglk.
445                  */
446                 if (TAILQ_EMPTY(&iocom->msgq)) {
447                         atomic_set_int(&iocom->msg_ctl,
448                                        KDMSG_CLUSTERCTL_SLEEPING);
449                         lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
450                         atomic_clear_int(&iocom->msg_ctl,
451                                          KDMSG_CLUSTERCTL_SLEEPING);
452                 }
453
454                 while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
455                         /*
456                          * Remove msg from the transmit queue and do
457                          * persist and half-closed state handling.
458                          */
459                         TAILQ_REMOVE(&iocom->msgq, msg, qentry);
460
461                         error = kdmsg_state_msgtx(msg);
462                         if (error == EALREADY) {
463                                 error = 0;
464                                 kdmsg_msg_free(msg);
465                                 continue;
466                         }
467                         if (error) {
468                                 kdmsg_msg_free(msg);
469                                 break;
470                         }
471
472                         /*
473                          * Dump the message to the pipe or socket.
474                          *
475                          * We have to clean up the message as if the transmit
476                          * succeeded even if it failed.
477                          */
478                         lockmgr(&iocom->msglk, LK_RELEASE);
479                         error = fp_write(iocom->msg_fp, &msg->any,
480                                          msg->hdr_size, &res, UIO_SYSSPACE);
481                         if (error || res != msg->hdr_size) {
482                                 if (error == 0)
483                                         error = EINVAL;
484                                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
485                                 kdmsg_state_cleanuptx(msg);
486                                 break;
487                         }
488                         if (msg->aux_size) {
489                                 abytes = DMSG_DOALIGN(msg->aux_size);
490                                 error = fp_write(iocom->msg_fp,
491                                                  msg->aux_data, abytes,
492                                                  &res, UIO_SYSSPACE);
493                                 if (error || res != abytes) {
494                                         if (error == 0)
495                                                 error = EINVAL;
496                                         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
497                                         kdmsg_state_cleanuptx(msg);
498                                         break;
499                                 }
500                         }
501                         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
502                         kdmsg_state_cleanuptx(msg);
503                 }
504         }
505
506         kdio_printf(iocom, 1, "write thread terminating error=%d\n", error);
507
508         /*
509          * Shutdown the socket and set KILLTX for consistency in case the
510          * shutdown was not commanded.  Signal the receive side to shutdown
511          * by setting KILLRX and waking it up.
512          */
513         fp_shutdown(iocom->msg_fp, SHUT_RDWR);
514         atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
515                                         KDMSG_CLUSTERCTL_KILLTX);
516         wakeup(&iocom->msg_ctl);
517
518         /*
519          * The transmit thread is responsible for final cleanups, wait
520          * for the receive side to terminate to prevent new received
521          * states from interfering with our cleanup.
522          *
523          * Do not set msgwr_td to NULL until we actually exit.
524          */
525         while (iocom->msgrd_td) {
526                 wakeup(&iocom->msg_ctl);
527                 lksleep(iocom, &iocom->msglk, 0, "clstrkt", hz);
528         }
529
530         /*
531          * We can no longer receive new messages.  We must drain the transmit
532          * message queue and simulate received messages to close anay remaining
533          * states.
534          *
535          * Loop until all the states are gone and there are no messages
536          * pending transmit.
537          */
538         save_ticks = ticks;
539         didwarn = 0;
540
541         while (TAILQ_FIRST(&iocom->msgq) ||
542                RB_ROOT(&iocom->staterd_tree) ||
543                RB_ROOT(&iocom->statewr_tree)) {
544                 /*
545                  * Simulate failure for all sub-states of state0.
546                  */
547                 kdmsg_drain_msgq(iocom);
548                 kdio_printf(iocom, 2, "%s\n",
549                             "simulate failure for all substates of state0");
550                 kdmsg_simulate_failure(&iocom->state0, 0, DMSG_ERR_LOSTLINK);
551
552                 lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz / 2);
553
554                 if ((int)(ticks - save_ticks) > hz*2 && didwarn == 0) {
555                         didwarn = 1;
556                         kdio_printf(iocom, 0,
557                                     "Warning, write thread on %p "
558                                     "still terminating\n",
559                                     iocom);
560                 }
561                 if ((int)(ticks - save_ticks) > hz*15 && didwarn == 1) {
562                         didwarn = 2;
563                         kdio_printf(iocom, 0,
564                                     "Warning, write thread on %p "
565                                     "still terminating\n",
566                                     iocom);
567                 }
568                 if ((int)(ticks - save_ticks) > hz*60) {
569                         kdio_printf(iocom, 0,
570                                     "Can't terminate: msgq %p "
571                                     "rd_tree %p wr_tree %p\n",
572                                     TAILQ_FIRST(&iocom->msgq),
573                                     RB_ROOT(&iocom->staterd_tree),
574                                     RB_ROOT(&iocom->statewr_tree));
575                         lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz * 10);
576                 }
577         }
578
579         /*
580          * Exit handling is done by the write thread.
581          */
582         iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
583         lockmgr(&iocom->msglk, LK_RELEASE);
584
585         /*
586          * The state trees had better be empty now
587          */
588         KKASSERT(RB_EMPTY(&iocom->staterd_tree));
589         KKASSERT(RB_EMPTY(&iocom->statewr_tree));
590         KKASSERT(iocom->conn_state == NULL);
591
592         if (iocom->exit_func) {
593                 /*
594                  * iocom is invalid after we call the exit function.
595                  */
596                 iocom->msgwr_td = NULL;
597                 iocom->exit_func(iocom);
598         } else {
599                 /*
600                  * iocom can be ripped out from under us once msgwr_td is
601                  * set to NULL.  The wakeup is safe.
602                  */
603                 iocom->msgwr_td = NULL;
604                 wakeup(iocom);
605         }
606         lwkt_exit();
607 }
608
609 /*
610  * This cleans out the pending transmit message queue, adjusting any
611  * persistent states properly in the process.
612  *
613  * Called with iocom locked.
614  */
615 void
616 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
617 {
618         kdmsg_msg_t *msg;
619
620         /*
621          * Clean out our pending transmit queue, executing the
622          * appropriate state adjustments.  If this tries to open
623          * any new outgoing transactions we have to loop up and
624          * clean them out.
625          */
626         while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
627                 TAILQ_REMOVE(&iocom->msgq, msg, qentry);
628                 if (kdmsg_state_msgtx(msg))
629                         kdmsg_msg_free(msg);
630                 else
631                         kdmsg_state_cleanuptx(msg);
632         }
633 }
634
635 /*
636  * Do all processing required to handle a freshly received message
637  * after its low level header has been validated.
638  *
639  * iocom is not locked.
640  */
641 static
642 int
643 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
644 {
645         kdmsg_iocom_t *iocom = msg->state->iocom;
646         int error;
647
648         /*
649          * State machine tracking, state assignment for msg,
650          * returns error and discard status.  Errors are fatal
651          * to the connection except for EALREADY which forces
652          * a discard without execution.
653          */
654         error = kdmsg_state_msgrx(msg);
655         if (msg->state->flags & KDMSG_STATE_ABORTING) {
656                 kdio_printf(iocom, 5,
657                             "kdmsg_state_abort(b): state %p rxcmd=%08x "
658                             "txcmd=%08x msgrx error %d\n",
659                             msg->state, msg->state->rxcmd,
660                             msg->state->txcmd, error);
661         }
662         if (error) {
663                 /*
664                  * Raw protocol or connection error
665                  */
666                 if (msg->state->flags & KDMSG_STATE_ABORTING)
667                         kdio_printf(iocom, 5,
668                                     "X1 state %p error %d\n",
669                                     msg->state, error);
670                 kdmsg_msg_free(msg);
671                 if (error == EALREADY)
672                         error = 0;
673         } else if (msg->state && msg->state->func) {
674                 /*
675                  * Message related to state which already has a
676                  * handling function installed for it.
677                  */
678                 if (msg->state->flags & KDMSG_STATE_ABORTING)
679                         kdio_printf(iocom, 5,
680                                     "X2 state %p func %p\n",
681                                     msg->state, msg->state->func);
682                 error = msg->state->func(msg->state, msg);
683                 kdmsg_state_cleanuprx(msg);
684         } else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
685                 if (msg->state->flags & KDMSG_STATE_ABORTING)
686                         kdio_printf(iocom, 5,
687                                     "X3 state %p\n", msg->state);
688                 error = kdmsg_autorxmsg(msg);
689                 kdmsg_state_cleanuprx(msg);
690         } else {
691                 if (msg->state->flags & KDMSG_STATE_ABORTING)
692                         kdio_printf(iocom, 5,
693                                     "X4 state %p\n", msg->state);
694                 error = iocom->rcvmsg(msg);
695                 kdmsg_state_cleanuprx(msg);
696         }
697         return error;
698 }
699
700 /*
701  * Process state tracking for a message after reception and dequeueing,
702  * prior to execution of the state callback.  The state is updated and
703  * will be removed from the RBTREE if completely closed, but the state->parent
704  * and subq linkage is not cleaned up until after the callback (see
705  * cleanuprx()).
706  *
707  * msglk is not held.
708  *
709  * NOTE: A message transaction can consist of several messages in either
710  *       direction.
711  *
712  * NOTE: The msgid is unique to the initiator, not necessarily unique for
713  *       us or for any relay or for the return direction for that matter.
714  *       That is, two sides sending a new message can use the same msgid
715  *       without colliding.
716  *
717  * --
718  *
719  * ABORT sequences work by setting the ABORT flag along with normal message
720  * state.  However, ABORTs can also be sent on half-closed messages, that is
721  * even if the command or reply side has already sent a DELETE, as long as
722  * the message has not been fully closed it can still send an ABORT+DELETE
723  * to terminate the half-closed message state.
724  *
725  * Since ABORT+DELETEs can race we silently discard ABORT's for message
726  * state which has already been fully closed.  REPLY+ABORT+DELETEs can
727  * also race, and in this situation the other side might have already
728  * initiated a new unrelated command with the same message id.  Since
729  * the abort has not set the CREATE flag the situation can be detected
730   * and the message will also be discarded.
731  *
732  * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
733  * The ABORT request is essentially integrated into the command instead
734  * of being sent later on.  In this situation the command implementation
735  * detects that CREATE and ABORT are both set (vs ABORT alone) and can
736  * special-case non-blocking operation for the command.
737  *
738  * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
739  *        to be mid-stream aborts for command/reply sequences.  ABORTs on
740  *        one-way messages are not supported.
741  *
742  * NOTE!  If a command sequence does not support aborts the ABORT flag is
743  *        simply ignored.
744  *
745  * --
746  *
747  * One-off messages (no reply expected) are sent with neither CREATE nor DELETE
748  * set.  One-off messages cannot be aborted and typically aren't processed
749  * by these routines.  The REPLY bit can be used to distinguish whether a
750  * one-off message is a command or reply.  For example, one-off replies
751  * will typically just contain status updates.
752  */
753 static
754 int
755 kdmsg_state_msgrx(kdmsg_msg_t *msg)
756 {
757         kdmsg_iocom_t *iocom = msg->state->iocom;
758         kdmsg_state_t *state;
759         kdmsg_state_t *pstate;
760         kdmsg_state_t sdummy;
761         int error;
762
763         bzero(&sdummy, sizeof(sdummy)); /* avoid gcc warnings */
764
765         /*
766          * Make sure a state structure is ready to go in case we need a new
767          * one.  This is the only routine which uses freerd_state so no
768          * races are possible.
769          */
770         if ((state = iocom->freerd_state) == NULL) {
771                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
772                 state->flags = KDMSG_STATE_DYNAMIC;
773                 state->iocom = iocom;
774                 state->refs = 1;
775                 TAILQ_INIT(&state->subq);
776                 iocom->freerd_state = state;
777         }
778         state = NULL;   /* safety */
779
780         /*
781          * Lock RB tree and locate existing persistent state, if any.
782          *
783          * If received msg is a command state is on staterd_tree.
784          * If received msg is a reply state is on statewr_tree.
785          */
786         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
787
788 again:
789         if (msg->state == &iocom->state0) {
790                 sdummy.msgid = msg->any.head.msgid;
791                 sdummy.iocom = iocom;
792                 if (msg->any.head.cmd & DMSGF_REVTRANS) {
793                         state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree,
794                                         &sdummy);
795                 } else {
796                         state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree,
797                                         &sdummy);
798                 }
799
800                 /*
801                  * Set message state unconditionally.  If this is a CREATE
802                  * message this state will become the parent state and new
803                  * state will be allocated for the message state.
804                  */
805                 if (state == NULL)
806                         state = &iocom->state0;
807                 if (state->flags & KDMSG_STATE_INTERLOCK) {
808                         state->flags |= KDMSG_STATE_SIGNAL;
809                         lksleep(state, &iocom->msglk, 0, "dmrace", hz);
810                         goto again;
811                 }
812                 kdmsg_state_hold(state);
813                 kdmsg_state_drop(msg->state);   /* iocom->state0 */
814                 msg->state = state;
815         } else {
816                 state = msg->state;
817         }
818
819         /*
820          * Short-cut one-off or mid-stream messages.
821          */
822         if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
823                                   DMSGF_ABORT)) == 0) {
824                 error = 0;
825                 goto done;
826         }
827
828         /*
829          * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
830          * inside the case statements.
831          */
832         switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
833         case DMSGF_CREATE:
834         case DMSGF_CREATE | DMSGF_DELETE:
835                 /*
836                  * New persistant command received.
837                  */
838                 if (state != &iocom->state0) {
839                         kdio_printf(iocom, 1, "%s\n",
840                                     "duplicate transaction");
841                         error = EINVAL;
842                         break;
843                 }
844
845                 /*
846                  * Lookup the circuit.  The circuit is an open transaction.
847                  * the REVCIRC bit in the message tells us which side
848                  * initiated the transaction representing the circuit.
849                  */
850                 if (msg->any.head.circuit) {
851                         sdummy.msgid = msg->any.head.circuit;
852
853                         if (msg->any.head.cmd & DMSGF_REVCIRC) {
854                                 pstate = RB_FIND(kdmsg_state_tree,
855                                                  &iocom->statewr_tree,
856                                                  &sdummy);
857                         } else {
858                                 pstate = RB_FIND(kdmsg_state_tree,
859                                                  &iocom->staterd_tree,
860                                                  &sdummy);
861                         }
862                         if (pstate == NULL) {
863                                 kdio_printf(iocom, 1, "%s\n",
864                                             "missing parent in "
865                                             "stacked trans");
866                                 error = EINVAL;
867                                 break;
868                         }
869                 } else {
870                         pstate = &iocom->state0;
871                 }
872
873                 /*
874                  * Allocate new state.
875                  *
876                  * msg->state becomes the owner of the ref we inherit from
877                  * freerd_stae.
878                  */
879                 kdmsg_state_drop(state);
880                 state = iocom->freerd_state;
881                 iocom->freerd_state = NULL;
882
883                 msg->state = state;             /* inherits freerd ref */
884                 state->parent = pstate;
885                 KKASSERT(state->iocom == iocom);
886                 state->flags |= KDMSG_STATE_RBINSERTED |
887                                 KDMSG_STATE_SUBINSERTED |
888                                 KDMSG_STATE_OPPOSITE;
889                 if (TAILQ_EMPTY(&pstate->subq))
890                         kdmsg_state_hold(pstate);/* states on pstate->subq */
891                 kdmsg_state_hold(state);        /* state on pstate->subq */
892                 kdmsg_state_hold(state);        /* state on rbtree */
893                 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
894                 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
895                 state->txcmd = DMSGF_REPLY;
896                 state->msgid = msg->any.head.msgid;
897                 state->flags &= ~KDMSG_STATE_NEW;
898                 RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
899                 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
900                 error = 0;
901                 break;
902         case DMSGF_DELETE:
903                 /*
904                  * Persistent state is expected but might not exist if an
905                  * ABORT+DELETE races the close.
906                  */
907                 if (state == &iocom->state0) {
908                         if (msg->any.head.cmd & DMSGF_ABORT) {
909                                 kdio_printf(iocom, 1, "%s\n",
910                                             "msgrx: "
911                                             "state already A");
912                                 error = EALREADY;
913                         } else {
914                                 kdio_printf(iocom, 1, "%s\n",
915                                             "msgrx: no state for DELETE");
916                                 error = EINVAL;
917                         }
918                         break;
919                 }
920
921                 /*
922                  * Handle another ABORT+DELETE case if the msgid has already
923                  * been reused.
924                  */
925                 if ((state->rxcmd & DMSGF_CREATE) == 0) {
926                         if (msg->any.head.cmd & DMSGF_ABORT) {
927                                 kdio_printf(iocom, 1, "%s\n",
928                                             "msgrx: state already B");
929                                 error = EALREADY;
930                         } else {
931                                 kdio_printf(iocom, 1, "%s\n",
932                                             "msgrx: state reused for DELETE");
933                                 error = EINVAL;
934                         }
935                         break;
936                 }
937                 error = 0;
938                 break;
939         default:
940                 /*
941                  * Check for mid-stream ABORT command received, otherwise
942                  * allow.
943                  */
944                 if (msg->any.head.cmd & DMSGF_ABORT) {
945                         if (state == &iocom->state0 ||
946                             (state->rxcmd & DMSGF_CREATE) == 0) {
947                                 error = EALREADY;
948                                 break;
949                         }
950                 }
951                 error = 0;
952                 break;
953         case DMSGF_REPLY | DMSGF_CREATE:
954         case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
955                 /*
956                  * When receiving a reply with CREATE set the original
957                  * persistent state message should already exist.
958                  */
959                 if (state == &iocom->state0) {
960                         kdio_printf(iocom, 1,
961                                     "msgrx: no state match for "
962                                     "REPLY cmd=%08x msgid=%016jx\n",
963                                     msg->any.head.cmd,
964                                     (intmax_t)msg->any.head.msgid);
965                         error = EINVAL;
966                         break;
967                 }
968                 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
969                 error = 0;
970                 break;
971         case DMSGF_REPLY | DMSGF_DELETE:
972                 /*
973                  * Received REPLY+ABORT+DELETE in case where msgid has
974                  * already been fully closed, ignore the message.
975                  */
976                 if (state == &iocom->state0) {
977                         if (msg->any.head.cmd & DMSGF_ABORT) {
978                                 error = EALREADY;
979                         } else {
980                                 kdio_printf(iocom, 1, "%s\n",
981                                             "msgrx: no state match "
982                                             "for REPLY|DELETE");
983                                 error = EINVAL;
984                         }
985                         break;
986                 }
987
988                 /*
989                  * Received REPLY+ABORT+DELETE in case where msgid has
990                  * already been reused for an unrelated message,
991                  * ignore the message.
992                  */
993                 if ((state->rxcmd & DMSGF_CREATE) == 0) {
994                         if (msg->any.head.cmd & DMSGF_ABORT) {
995                                 error = EALREADY;
996                         } else {
997                                 kdio_printf(iocom, 1, "%s\n",
998                                             "msgrx: state reused "
999                                             "for REPLY|DELETE");
1000                                 error = EINVAL;
1001                         }
1002                         break;
1003                 }
1004                 error = 0;
1005                 break;
1006         case DMSGF_REPLY:
1007                 /*
1008                  * Check for mid-stream ABORT reply received to sent command.
1009                  */
1010                 if (msg->any.head.cmd & DMSGF_ABORT) {
1011                         if (state == &iocom->state0 ||
1012                             (state->rxcmd & DMSGF_CREATE) == 0) {
1013                                 error = EALREADY;
1014                                 break;
1015                         }
1016                 }
1017                 error = 0;
1018                 break;
1019         }
1020
1021         /*
1022          * Calculate the easy-switch() transactional command.  Represents
1023          * the outer-transaction command for any transaction-create or
1024          * transaction-delete, and the inner message command for any
1025          * non-transaction or inside-transaction command.  tcmd will be
1026          * set to 0 if the message state is illegal.
1027          *
1028          * The two can be told apart because outer-transaction commands
1029          * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
1030          */
1031 done:
1032         if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
1033                 if (state != &iocom->state0) {
1034                         msg->tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1035                                     (msg->any.head.cmd & (DMSGF_CREATE |
1036                                                           DMSGF_DELETE |
1037                                                           DMSGF_REPLY));
1038                 } else {
1039                         msg->tcmd = 0;
1040                 }
1041         } else {
1042                 msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
1043         }
1044
1045         /*
1046          * Adjust the state for DELETE handling now, before making the
1047          * callback so we are atomic with other state updates.
1048          *
1049          * Subq/parent linkages are cleaned up after the callback.
1050          * If an error occurred the message is ignored and state is not
1051          * updated.
1052          */
1053         if ((state = msg->state) == NULL || error != 0) {
1054                 kdio_printf(iocom, 1,
1055                             "msgrx: state=%p error %d\n",
1056                             state, error);
1057         } else if (msg->any.head.cmd & DMSGF_DELETE) {
1058                 KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1059                 state->rxcmd |= DMSGF_DELETE;
1060                 if (state->txcmd & DMSGF_DELETE) {
1061                         KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1062                         if (state->rxcmd & DMSGF_REPLY) {
1063                                 KKASSERT(msg->any.head.cmd &
1064                                          DMSGF_REPLY);
1065                                 RB_REMOVE(kdmsg_state_tree,
1066                                           &iocom->statewr_tree, state);
1067                         } else {
1068                                 KKASSERT((msg->any.head.cmd &
1069                                           DMSGF_REPLY) == 0);
1070                                 RB_REMOVE(kdmsg_state_tree,
1071                                           &iocom->staterd_tree, state);
1072                         }
1073                         state->flags &= ~KDMSG_STATE_RBINSERTED;
1074                         kdmsg_state_drop(state);        /* state on rbtree */
1075                 }
1076         }
1077         lockmgr(&iocom->msglk, LK_RELEASE);
1078
1079         return (error);
1080 }
1081
1082 /*
1083  * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
1084  * This routine must call iocom->rcvmsg() for anything not automatically
1085  * handled.
1086  */
1087 static int
1088 kdmsg_autorxmsg(kdmsg_msg_t *msg)
1089 {
1090         kdmsg_iocom_t *iocom = msg->state->iocom;
1091         kdmsg_msg_t *rep;
1092         int error = 0;
1093         uint32_t cmd;
1094
1095         /*
1096          * Main switch processes transaction create/delete sequences only.
1097          * Use icmd (DELETEs use DMSG_LNK_ERROR
1098          *
1099          * NOTE: If processing in-transaction messages you generally want
1100          *       an inner switch on msg->any.head.cmd.
1101          */
1102         if (msg->state) {
1103                 cmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1104                       (msg->any.head.cmd & (DMSGF_CREATE |
1105                                             DMSGF_DELETE |
1106                                             DMSGF_REPLY));
1107         } else {
1108                 cmd = 0;
1109         }
1110
1111         switch(cmd) {
1112         case DMSG_LNK_PING:
1113                 /*
1114                  * Received ping, send reply
1115                  */
1116                 rep = kdmsg_msg_alloc(msg->state, DMSG_LNK_PING | DMSGF_REPLY,
1117                                       NULL, NULL);
1118                 kdmsg_msg_write(rep);
1119                 break;
1120         case DMSG_LNK_PING | DMSGF_REPLY:
1121                 /* ignore replies */
1122                 break;
1123         case DMSG_LNK_CONN | DMSGF_CREATE:
1124         case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
1125                 /*
1126                  * Received LNK_CONN transaction.  Transmit response and
1127                  * leave transaction open, which allows the other end to
1128                  * start to the SPAN protocol.
1129                  *
1130                  * Handle shim after acknowledging the CONN.
1131                  */
1132                 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1133                         if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1134                                 kdmsg_msg_result(msg, 0);
1135                                 if (iocom->auto_callback)
1136                                         iocom->auto_callback(msg);
1137                         } else {
1138                                 error = iocom->rcvmsg(msg);
1139                         }
1140                         break;
1141                 }
1142                 /* fall through */
1143         case DMSG_LNK_CONN | DMSGF_DELETE:
1144                 /*
1145                  * This message is usually simulated after a link is lost
1146                  * to clean up the transaction.
1147                  */
1148                 if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1149                         if (iocom->auto_callback)
1150                                 iocom->auto_callback(msg);
1151                         kdmsg_msg_reply(msg, 0);
1152                 } else {
1153                         error = iocom->rcvmsg(msg);
1154                 }
1155                 break;
1156         case DMSG_LNK_SPAN | DMSGF_CREATE:
1157         case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1158                 /*
1159                  * Received LNK_SPAN transaction.  We do not have to respond
1160                  * (except on termination), but we must leave the transaction
1161                  * open.
1162                  *
1163                  * Handle shim after acknowledging the SPAN.
1164                  */
1165                 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1166                         if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1167                                 if (iocom->auto_callback)
1168                                         iocom->auto_callback(msg);
1169                                 break;
1170                         }
1171                         /* fall through */
1172                 } else {
1173                         error = iocom->rcvmsg(msg);
1174                         break;
1175                 }
1176                 /* fall through */
1177         case DMSG_LNK_SPAN | DMSGF_DELETE:
1178                 /*
1179                  * Process shims (auto_callback) before cleaning up the
1180                  * circuit structure and closing the transactions.  Device
1181                  * driver should ensure that the circuit is not used after
1182                  * the auto_callback() returns.
1183                  *
1184                  * Handle shim before closing the SPAN transaction.
1185                  */
1186                 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1187                         if (iocom->auto_callback)
1188                                 iocom->auto_callback(msg);
1189                         kdmsg_msg_reply(msg, 0);
1190                 } else {
1191                         error = iocom->rcvmsg(msg);
1192                 }
1193                 break;
1194         default:
1195                 /*
1196                  * Anything unhandled goes into rcvmsg.
1197                  *
1198                  * NOTE: Replies to link-level messages initiated by our side
1199                  *       are handled by the state callback, they are NOT
1200                  *       handled here.
1201                  */
1202                 error = iocom->rcvmsg(msg);
1203                 break;
1204         }
1205         return (error);
1206 }
1207
1208 /*
1209  * Post-receive-handling message and state cleanup.  This routine is called
1210  * after the state function handling/callback to properly dispose of the
1211  * message and unlink the state's parent/subq linkage if the state is
1212  * completely closed.
1213  *
1214  * msglk is not held.
1215  */
1216 static
1217 void
1218 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1219 {
1220         kdmsg_state_t *state = msg->state;
1221         kdmsg_iocom_t *iocom = state->iocom;
1222
1223         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1224         if (state != &iocom->state0) {
1225                 /*
1226                  * When terminating a transaction (in either direction), all
1227                  * sub-states are aborted.
1228                  */
1229                 if ((msg->any.head.cmd & DMSGF_DELETE) &&
1230                     TAILQ_FIRST(&msg->state->subq)) {
1231                         kdio_printf(iocom, 2,
1232                                     "simulate failure for substates of "
1233                                     "state %p cmd %08x/%08x\n",
1234                                     msg->state,
1235                                     msg->state->rxcmd,
1236                                     msg->state->txcmd);
1237                         kdmsg_simulate_failure(msg->state,
1238                                                0, DMSG_ERR_LOSTLINK);
1239                 }
1240
1241                 /*
1242                  * Once the state is fully closed we can (try to) remove it
1243                  * from the subq topology.
1244                  */
1245                 if ((state->flags & KDMSG_STATE_SUBINSERTED) &&
1246                     (state->rxcmd & DMSGF_DELETE) &&
1247                     (state->txcmd & DMSGF_DELETE)) {
1248                         /* 
1249                          * Remove parent linkage if state is completely closed.
1250                          */
1251                         kdmsg_subq_delete(state);
1252                 }
1253         }
1254         kdmsg_msg_free(msg);
1255
1256         lockmgr(&iocom->msglk, LK_RELEASE);
1257 }
1258
1259 /*
1260  * Remove state from its parent's subq.  This can wind up recursively
1261  * dropping the parent upward.
1262  *
1263  * NOTE: Once we drop the parent, our pstate pointer may become invalid.
1264  */
1265 static
1266 void
1267 kdmsg_subq_delete(kdmsg_state_t *state)
1268 {
1269         kdmsg_state_t *pstate;
1270
1271         if (state->flags & KDMSG_STATE_SUBINSERTED) {
1272                 pstate = state->parent;
1273                 KKASSERT(pstate);
1274                 if (pstate->scan == state)
1275                         pstate->scan = NULL;
1276                 TAILQ_REMOVE(&pstate->subq, state, entry);
1277                 state->flags &= ~KDMSG_STATE_SUBINSERTED;
1278                 state->parent = NULL;
1279                 if (TAILQ_EMPTY(&pstate->subq)) {
1280                         kdmsg_state_drop(pstate);/* pstate->subq */
1281                 }
1282                 pstate = NULL;                   /* safety */
1283                 kdmsg_state_drop(state);         /* pstate->subq */
1284         } else {
1285                 KKASSERT(state->parent == NULL);
1286         }
1287 }
1288
1289 /*
1290  * Simulate receiving a message which terminates an active transaction
1291  * state.  Our simulated received message must set DELETE and may also
1292  * have to set CREATE.  It must also ensure that all fields are set such
1293  * that the receive handling code can find the state (kdmsg_state_msgrx())
1294  * or an endless loop will ensue.
1295  *
1296  * This is used when the other end of the link is dead so the device driver
1297  * gets a completed transaction for all pending states.
1298  *
1299  * Called with iocom locked.
1300  */
1301 static
1302 void
1303 kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error)
1304 {
1305         kdmsg_state_t *substate;
1306
1307         kdmsg_state_hold(state);                /* aborting */
1308
1309         /*
1310          * Abort parent state first. Parent will not actually disappear
1311          * until children are gone.  Device drivers must handle the situation.
1312          * The advantage of this is that device drivers can flag the situation
1313          * as an interlock against new operations on dying states.  And since
1314          * device operations are often asynchronous anyway, this sequence of
1315          * events works out better.
1316          */
1317         if (meto)
1318                 kdmsg_state_abort(state);
1319
1320         /*
1321          * Recurse through any children.
1322          */
1323 again:
1324         TAILQ_FOREACH(substate, &state->subq, entry) {
1325                 if (substate->flags & KDMSG_STATE_ABORTING)
1326                         continue;
1327                 state->scan = substate;
1328                 kdmsg_simulate_failure(substate, 1, error);
1329                 if (state->scan != substate)
1330                         goto again;
1331         }
1332         kdmsg_state_drop(state);                /* aborting */
1333 }
1334
1335 static
1336 void
1337 kdmsg_state_abort(kdmsg_state_t *state)
1338 {
1339         kdmsg_msg_t *msg;
1340
1341         /*
1342          * Set ABORTING and DYING, return if already set.  If the state was
1343          * just allocated we defer the abort operation until the related
1344          * message is processed.
1345          */
1346         KKASSERT((state->flags & KDMSG_STATE_ABORTING) == 0);
1347         if (state->flags & KDMSG_STATE_ABORTING)
1348                 return;
1349         state->flags |= KDMSG_STATE_ABORTING;
1350         kdmsg_state_dying(state);
1351         if (state->flags & KDMSG_STATE_NEW) {
1352                 kdio_printf(iocom, 5,
1353                             "kdmsg_state_abort(0): state %p rxcmd %08x "
1354                             "txcmd %08x flags %08x - in NEW state\n",
1355                             state, state->rxcmd,
1356                             state->txcmd, state->flags);
1357                 return;
1358         }
1359
1360         /*
1361          * NOTE: The DELETE flag might already be set due to an early
1362          *       termination.
1363          *
1364          * NOTE: Args to kdmsg_msg_alloc() to avoid dynamic state allocation.
1365          *
1366          * NOTE: We are simulating a received message using our state
1367          *       (vs a message generated by the other side using its state),
1368          *       so we must invert DMSGF_REVTRANS and DMSGF_REVCIRC.
1369          */
1370         kdio_printf(iocom, 5, 
1371                     "kdmsg_state_abort(1): state %p rxcmd %08x txcmd %08x\n",
1372                     state, state->rxcmd, state->txcmd);
1373         if ((state->rxcmd & DMSGF_DELETE) == 0) {
1374                 msg = kdmsg_msg_alloc(state, DMSG_LNK_ERROR, NULL, NULL);
1375                 if ((state->rxcmd & DMSGF_CREATE) == 0)
1376                         msg->any.head.cmd |= DMSGF_CREATE;
1377                 msg->any.head.cmd |= DMSGF_DELETE |
1378                                      (state->rxcmd & DMSGF_REPLY);
1379                 msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
1380                 msg->any.head.error = DMSG_ERR_LOSTLINK;
1381                 kdio_printf(iocom, 5,
1382                             "kdmsg_state_abort(a): state %p msgcmd %08x\n",
1383                             state, msg->any.head.cmd);
1384                 /* circuit not initialized */
1385                 lockmgr(&state->iocom->msglk, LK_RELEASE);
1386                 kdmsg_msg_receive_handling(msg);
1387                 lockmgr(&state->iocom->msglk, LK_EXCLUSIVE);
1388                 msg = NULL;
1389         }
1390         kdio_printf(iocom, 5,
1391                     "kdmsg_state_abort(2): state %p rxcmd %08x txcmd %08x\n",
1392                     state, state->rxcmd, state->txcmd);
1393 }
1394
1395 /*
1396  * Recursively sets KDMSG_STATE_DYING on state and all sub-states, preventing
1397  * the transmission of any new messages on these states.  This is done
1398  * atomically when parent state is terminating, whereas setting ABORTING is
1399  * not atomic and can leak races.
1400  */
1401 static
1402 void
1403 kdmsg_state_dying(kdmsg_state_t *state)
1404 {
1405         kdmsg_state_t *scan;
1406
1407         if ((state->flags & KDMSG_STATE_DYING) == 0) {
1408                 state->flags |= KDMSG_STATE_DYING;
1409                 TAILQ_FOREACH(scan, &state->subq, entry)
1410                         kdmsg_state_dying(scan);
1411         }
1412 }
1413
1414 /*
1415  * Process state tracking for a message prior to transmission.
1416  *
1417  * Called with msglk held and the msg dequeued.  Returns non-zero if
1418  * the message is bad and should be deleted by the caller.
1419  *
1420  * One-off messages are usually with dummy state and msg->state may be NULL
1421  * in this situation.
1422  *
1423  * New transactions (when CREATE is set) will insert the state.
1424  *
1425  * May request that caller discard the message by setting *discardp to 1.
1426  * A NULL state may be returned in this case.
1427  */
1428 static
1429 int
1430 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1431 {
1432         kdmsg_iocom_t *iocom = msg->state->iocom;
1433         kdmsg_state_t *state;
1434         int error;
1435
1436         /*
1437          * Make sure a state structure is ready to go in case we need a new
1438          * one.  This is the only routine which uses freewr_state so no
1439          * races are possible.
1440          */
1441         if ((state = iocom->freewr_state) == NULL) {
1442                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1443                 state->flags = KDMSG_STATE_DYNAMIC;
1444                 state->iocom = iocom;
1445                 state->refs = 1;
1446                 TAILQ_INIT(&state->subq);
1447                 iocom->freewr_state = state;
1448         }
1449
1450         /*
1451          * Lock RB tree.  If persistent state is present it will have already
1452          * been assigned to msg.
1453          */
1454         state = msg->state;
1455
1456         /*
1457          * Short-cut one-off or mid-stream messages (state may be NULL).
1458          */
1459         if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1460                                   DMSGF_ABORT)) == 0) {
1461                 return(0);
1462         }
1463
1464
1465         /*
1466          * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1467          * inside the case statements.
1468          */
1469         switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1470                                     DMSGF_REPLY)) {
1471         case DMSGF_CREATE:
1472         case DMSGF_CREATE | DMSGF_DELETE:
1473                 /*
1474                  * Insert the new persistent message state and mark
1475                  * half-closed if DELETE is set.  Since this is a new
1476                  * message it isn't possible to transition into the fully
1477                  * closed state here.
1478                  *
1479                  * XXX state must be assigned and inserted by
1480                  *     kdmsg_msg_write().  txcmd is assigned by us
1481                  *     on-transmit.
1482                  */
1483                 KKASSERT(state != NULL);
1484                 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1485                 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1486                 state->rxcmd = DMSGF_REPLY;
1487                 state->flags &= ~KDMSG_STATE_NEW;
1488                 error = 0;
1489                 break;
1490         case DMSGF_DELETE:
1491                 /*
1492                  * Sent ABORT+DELETE in case where msgid has already
1493                  * been fully closed, ignore the message.
1494                  */
1495                 if (state == &iocom->state0) {
1496                         if (msg->any.head.cmd & DMSGF_ABORT) {
1497                                 error = EALREADY;
1498                         } else {
1499                                 kdio_printf(iocom, 1,
1500                                         "msgtx: no state match "
1501                                         "for DELETE cmd=%08x msgid=%016jx\n",
1502                                         msg->any.head.cmd,
1503                                         (intmax_t)msg->any.head.msgid);
1504                                 error = EINVAL;
1505                         }
1506                         break;
1507                 }
1508
1509                 /*
1510                  * Sent ABORT+DELETE in case where msgid has
1511                  * already been reused for an unrelated message,
1512                  * ignore the message.
1513                  */
1514                 if ((state->txcmd & DMSGF_CREATE) == 0) {
1515                         if (msg->any.head.cmd & DMSGF_ABORT) {
1516                                 error = EALREADY;
1517                         } else {
1518                                 kdio_printf(iocom, 1, "%s\n",
1519                                             "msgtx: state reused "
1520                                             "for DELETE");
1521                                 error = EINVAL;
1522                         }
1523                         break;
1524                 }
1525                 error = 0;
1526                 break;
1527         default:
1528                 /*
1529                  * Check for mid-stream ABORT command sent
1530                  */
1531                 if (msg->any.head.cmd & DMSGF_ABORT) {
1532                         if (state == &state->iocom->state0 ||
1533                             (state->txcmd & DMSGF_CREATE) == 0) {
1534                                 error = EALREADY;
1535                                 break;
1536                         }
1537                 }
1538                 error = 0;
1539                 break;
1540         case DMSGF_REPLY | DMSGF_CREATE:
1541         case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1542                 /*
1543                  * When transmitting a reply with CREATE set the original
1544                  * persistent state message should already exist.
1545                  */
1546                 if (state == &state->iocom->state0) {
1547                         kdio_printf(iocom, 1, "%s\n",
1548                                     "msgtx: no state match "
1549                                     "for REPLY | CREATE");
1550                         error = EINVAL;
1551                         break;
1552                 }
1553                 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1554                 error = 0;
1555                 break;
1556         case DMSGF_REPLY | DMSGF_DELETE:
1557                 /*
1558                  * When transmitting a reply with DELETE set the original
1559                  * persistent state message should already exist.
1560                  *
1561                  * This is very similar to the REPLY|CREATE|* case except
1562                  * txcmd is already stored, so we just add the DELETE flag.
1563                  *
1564                  * Sent REPLY+ABORT+DELETE in case where msgid has
1565                  * already been fully closed, ignore the message.
1566                  */
1567                 if (state == &state->iocom->state0) {
1568                         if (msg->any.head.cmd & DMSGF_ABORT) {
1569                                 error = EALREADY;
1570                         } else {
1571                                 kdio_printf(iocom, 1, "%s\n",
1572                                             "msgtx: no state match "
1573                                             "for REPLY | DELETE");
1574                                 error = EINVAL;
1575                         }
1576                         break;
1577                 }
1578
1579                 /*
1580                  * Sent REPLY+ABORT+DELETE in case where msgid has already
1581                  * been reused for an unrelated message, ignore the message.
1582                  */
1583                 if ((state->txcmd & DMSGF_CREATE) == 0) {
1584                         if (msg->any.head.cmd & DMSGF_ABORT) {
1585                                 error = EALREADY;
1586                         } else {
1587                                 kdio_printf(iocom, 1, "%s\n",
1588                                             "msgtx: state reused "
1589                                             "for REPLY | DELETE");
1590                                 error = EINVAL;
1591                         }
1592                         break;
1593                 }
1594                 error = 0;
1595                 break;
1596         case DMSGF_REPLY:
1597                 /*
1598                  * Check for mid-stream ABORT reply sent.
1599                  *
1600                  * One-off REPLY messages are allowed for e.g. status updates.
1601                  */
1602                 if (msg->any.head.cmd & DMSGF_ABORT) {
1603                         if (state == &state->iocom->state0 ||
1604                             (state->txcmd & DMSGF_CREATE) == 0) {
1605                                 error = EALREADY;
1606                                 break;
1607                         }
1608                 }
1609                 error = 0;
1610                 break;
1611         }
1612
1613         /*
1614          * Set interlock (XXX hack) in case the send side blocks and a
1615          * response is returned before kdmsg_state_cleanuptx() can be
1616          * run.
1617          */
1618         if (state && error == 0)
1619                 state->flags |= KDMSG_STATE_INTERLOCK;
1620
1621         return (error);
1622 }
1623
1624 /*
1625  * Called with iocom locked.
1626  */
1627 static
1628 void
1629 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1630 {
1631         kdmsg_iocom_t *iocom = msg->state->iocom;
1632         kdmsg_state_t *state;
1633
1634         if ((state = msg->state) == NULL) {
1635                 kdmsg_msg_free(msg);
1636                 return;
1637         }
1638
1639         /*
1640          * Clear interlock (XXX hack) in case the send side blocks and a
1641          * response is returned in the other thread before
1642          * kdmsg_state_cleanuptx() can be run.  We maintain our hold on
1643          * iocom->msglk so we can do this before completing our task.
1644          */
1645         if (state->flags & KDMSG_STATE_SIGNAL) {
1646                 kdio_printf(iocom, 1, "state %p interlock!\n", state);
1647                 wakeup(state);
1648         }
1649         state->flags &= ~(KDMSG_STATE_INTERLOCK | KDMSG_STATE_SIGNAL);
1650         kdmsg_state_hold(state);
1651
1652         if (msg->any.head.cmd & DMSGF_DELETE) {
1653                 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1654                 state->txcmd |= DMSGF_DELETE;
1655                 if (state->rxcmd & DMSGF_DELETE) {
1656                         KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1657                         if (state->txcmd & DMSGF_REPLY) {
1658                                 KKASSERT(msg->any.head.cmd &
1659                                          DMSGF_REPLY);
1660                                 RB_REMOVE(kdmsg_state_tree,
1661                                           &iocom->staterd_tree, state);
1662                         } else {
1663                                 KKASSERT((msg->any.head.cmd &
1664                                           DMSGF_REPLY) == 0);
1665                                 RB_REMOVE(kdmsg_state_tree,
1666                                           &iocom->statewr_tree, state);
1667                         }
1668                         state->flags &= ~KDMSG_STATE_RBINSERTED;
1669
1670                         /*
1671                          * The subq recursion is used for parent linking and
1672                          * scanning the topology for aborts, we can only
1673                          * remove leafs.  The circuit is effectively dead now,
1674                          * but topology won't be torn down until all of its
1675                          * children have finished/aborted.
1676                          *
1677                          * This is particularly important for end-point
1678                          * devices which might need to access private data
1679                          * in parent states.  Out of order disconnects can
1680                          * occur if an end-point device is processing a
1681                          * message transaction asynchronously because abort
1682                          * requests are basically synchronous and it probably
1683                          * isn't convenient (or possible) for the end-point
1684                          * to abort an asynchronous operation.
1685                          */
1686                         if (TAILQ_EMPTY(&state->subq))
1687                                 kdmsg_subq_delete(state);
1688                         kdmsg_msg_free(msg);
1689                         kdmsg_state_drop(state);   /* state on rbtree */
1690                 } else {
1691                         kdmsg_msg_free(msg);
1692                 }
1693         } else {
1694                 kdmsg_msg_free(msg);
1695         }
1696
1697         /*
1698          * Deferred abort after transmission.
1699          */
1700         if ((state->flags & (KDMSG_STATE_ABORTING | KDMSG_STATE_DYING)) &&
1701             (state->rxcmd & DMSGF_DELETE) == 0) {
1702                 kdio_printf(iocom, 5,
1703                             "kdmsg_state_cleanuptx: state=%p "
1704                             "executing deferred abort\n",
1705                             state);
1706                 state->flags &= ~KDMSG_STATE_ABORTING;
1707                 kdmsg_state_abort(state);
1708         }
1709         kdmsg_state_drop(state);
1710 }
1711
1712 static
1713 void
1714 _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1715 {
1716         atomic_add_int(&state->refs, 1);
1717 #if KDMSG_DEBUG
1718         kd_printf(4, "state %p +%d\t%s:%d\n", state, state->refs, file, line);
1719 #endif
1720 }
1721
1722 static
1723 void
1724 _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1725 {
1726         KKASSERT(state->refs > 0);
1727 #if KDMSG_DEBUG
1728         kd_printf(4, "state %p -%d\t%s:%d\n", state, state->refs, file, line);
1729 #endif
1730         if (atomic_fetchadd_int(&state->refs, -1) == 1)
1731                 kdmsg_state_free(state);
1732 }
1733
1734 static
1735 void
1736 kdmsg_state_free(kdmsg_state_t *state)
1737 {
1738         kdmsg_iocom_t *iocom = state->iocom;
1739
1740         KKASSERT((state->flags & KDMSG_STATE_RBINSERTED) == 0);
1741         KKASSERT((state->flags & KDMSG_STATE_SUBINSERTED) == 0);
1742         KKASSERT(TAILQ_EMPTY(&state->subq));
1743
1744         if (state != &state->iocom->state0)
1745                 kfree(state, iocom->mmsg);
1746 }
1747
1748 kdmsg_msg_t *
1749 kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
1750                 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1751 {
1752         kdmsg_iocom_t *iocom = state->iocom;
1753         kdmsg_state_t *pstate;
1754         kdmsg_msg_t *msg;
1755         size_t hbytes;
1756
1757         KKASSERT(iocom != NULL);
1758         hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1759         msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1760                       iocom->mmsg, M_WAITOK | M_ZERO);
1761         msg->hdr_size = hbytes;
1762
1763         if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
1764                 /*
1765                  * New transaction, requires tracking state and a unique
1766                  * msgid to be allocated.
1767                  *
1768                  * It is possible to race a circuit failure, inherit the
1769                  * parent's STATE_DYING flag to trigger an abort sequence
1770                  * in the transmit path.  By not inheriting ABORTING the
1771                  * abort sequence can recurse.
1772                  *
1773                  * NOTE: The transactions has not yet been initiated so we
1774                  *       cannot set DMSGF_CREATE/DELETE bits in txcmd or rxcmd.
1775                  *       We have to properly setup DMSGF_REPLY, however.
1776                  */
1777                 pstate = state;
1778                 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1779                 TAILQ_INIT(&state->subq);
1780                 state->iocom = iocom;
1781                 state->parent = pstate;
1782                 state->flags = KDMSG_STATE_DYNAMIC |
1783                                KDMSG_STATE_NEW;
1784                 state->func = func;
1785                 state->any.any = data;
1786                 state->msgid = (uint64_t)(uintptr_t)state;
1787                 /*msg->any.head.msgid = state->msgid;XXX*/
1788
1789                 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1790                 if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1791                         panic("duplicate msgid allocated");
1792                 if (TAILQ_EMPTY(&pstate->subq))
1793                         kdmsg_state_hold(pstate);/* pstate->subq */
1794                 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
1795                 state->flags |= KDMSG_STATE_RBINSERTED |
1796                                 KDMSG_STATE_SUBINSERTED;
1797                 state->flags |= pstate->flags & KDMSG_STATE_DYING;
1798                 kdmsg_state_hold(state);        /* pstate->subq */
1799                 kdmsg_state_hold(state);        /* state on rbtree */
1800                 kdmsg_state_hold(state);        /* msg->state */
1801                 lockmgr(&iocom->msglk, LK_RELEASE);
1802         } else {
1803                 pstate = state->parent;
1804                 KKASSERT(pstate != NULL);
1805                 kdmsg_state_hold(state);        /* msg->state */
1806         }
1807
1808         if (state->flags & KDMSG_STATE_OPPOSITE)
1809                 cmd |= DMSGF_REVTRANS;
1810         if (pstate->flags & KDMSG_STATE_OPPOSITE)
1811                 cmd |= DMSGF_REVCIRC;
1812
1813         msg->any.head.magic = DMSG_HDR_MAGIC;
1814         msg->any.head.cmd = cmd;
1815         msg->any.head.msgid = state->msgid;
1816         msg->any.head.circuit = pstate->msgid;
1817         msg->state = state;
1818
1819         return (msg);
1820 }
1821
1822 void
1823 kdmsg_msg_free(kdmsg_msg_t *msg)
1824 {
1825         kdmsg_iocom_t *iocom = msg->state->iocom;
1826         kdmsg_state_t *state;
1827
1828         if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1829             msg->aux_data && msg->aux_size) {
1830                 kfree(msg->aux_data, iocom->mmsg);
1831                 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1832         }
1833         if ((state = msg->state) != NULL) {
1834                 msg->state = NULL;
1835                 kdmsg_state_drop(state);        /* msg->state */
1836         }
1837         msg->aux_data = NULL;
1838         msg->aux_size = 0;
1839
1840         kfree(msg, iocom->mmsg);
1841 }
1842
1843 void
1844 kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data)
1845 {
1846         if (msg->flags & KDMSG_FLAG_AUXALLOC) {
1847                 data->aux_data = msg->aux_data;
1848                 data->aux_size = msg->aux_size;
1849                 data->iocom = msg->state->iocom;
1850                 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1851         } else {
1852                 data->aux_data = NULL;
1853                 data->aux_size = 0;
1854                 data->iocom = msg->state->iocom;
1855         }
1856 }
1857
1858 void
1859 kdmsg_free_aux_data(kdmsg_data_t *data)
1860 {
1861         if (data->aux_data)
1862                 kfree(data->aux_data, data->iocom->mmsg);
1863 }
1864
1865 /*
1866  * Indexed messages are stored in a red-black tree indexed by their
1867  * msgid.  Only persistent messages are indexed.
1868  */
1869 int
1870 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1871 {
1872         if (state1->iocom < state2->iocom)
1873                 return(-1);
1874         if (state1->iocom > state2->iocom)
1875                 return(1);
1876         if (state1->msgid < state2->msgid)
1877                 return(-1);
1878         if (state1->msgid > state2->msgid)
1879                 return(1);
1880         return(0);
1881 }
1882
1883 /*
1884  * Write a message.  All requisit command flags have been set.
1885  *
1886  * If msg->state is non-NULL the message is written to the existing
1887  * transaction.  msgid will be set accordingly.
1888  *
1889  * If msg->state is NULL and CREATE is set new state is allocated and
1890  * (func, data) is installed.  A msgid is assigned.
1891  *
1892  * If msg->state is NULL and CREATE is not set the message is assumed
1893  * to be a one-way message.  The originator must assign the msgid
1894  * (or leave it 0, which is typical.
1895  *
1896  * This function merely queues the message to the management thread, it
1897  * does not write to the message socket/pipe.
1898  */
1899 void
1900 kdmsg_msg_write(kdmsg_msg_t *msg)
1901 {
1902         kdmsg_iocom_t *iocom = msg->state->iocom;
1903
1904         lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1905         kdmsg_msg_write_locked(iocom, msg);
1906         lockmgr(&iocom->msglk, LK_RELEASE);
1907 }
1908
1909 static void
1910 kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg)
1911 {
1912         kdmsg_state_t *state;
1913
1914         if (msg->state) {
1915                 /*
1916                  * Continuance or termination of existing transaction.
1917                  * The transaction could have been initiated by either end.
1918                  *
1919                  * (Function callback and aux data for the receive side can
1920                  * be replaced or left alone).
1921                  */
1922                 state = msg->state;
1923                 msg->any.head.msgid = state->msgid;
1924         } else {
1925                 /*
1926                  * One-off message (always uses msgid 0 to distinguish
1927                  * between a possibly lost in-transaction message due to
1928                  * competing aborts and a real one-off message?)
1929                  */
1930                 state = NULL;
1931                 msg->any.head.msgid = 0;
1932         }
1933
1934 #if 0
1935         /*
1936          * XXX removed - don't make this a panic, allow the state checks
1937          *     below to catch the situation.
1938          *
1939          * This flag is not set until after the tx thread has drained
1940          * the tx msgq and simulated responses.  After that point the
1941          * txthread is dead and can no longer simulate responses.
1942          *
1943          * Device drivers should never try to send a message once this
1944          * flag is set.  They should have detected (through the state
1945          * closures) that the link is in trouble.
1946          */
1947         if (iocom->flags & KDMSG_IOCOMF_EXITNOACC) {
1948                 lockmgr(&iocom->msglk, LK_RELEASE);
1949                 panic("kdmsg_msg_write: Attempt to write message to "
1950                       "terminated iocom\n");
1951         }
1952 #endif
1953
1954         /*
1955          * For stateful messages, if the circuit is dead or dying we have
1956          * to abort the potentially newly-created state and discard the
1957          * message.
1958          *
1959          * - We must discard the message because the other end will not
1960          *   be expecting any more messages over the dead or dying circuit
1961          *   and might not be able to receive them.
1962          *
1963          * - We abort the state by simulating a failure to generate a fake
1964          *   incoming DELETE.  This will trigger the state callback and allow
1965          *   the device to clean things up and reply, closing the outgoing
1966          *   direction and allowing the state to be freed.
1967          *
1968          * This situation occurs quite often, particularly as SPANs stabilize.
1969          * End-points must do the right thing.
1970          */
1971         if (state) {
1972                 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1973                 if (state->flags & KDMSG_STATE_DYING) {
1974 #if 0
1975                 if ((state->flags & KDMSG_STATE_DYING) ||
1976                     (state->parent->txcmd & DMSGF_DELETE) ||
1977                     (state->parent->flags & KDMSG_STATE_DYING)) {
1978 #endif
1979                         kdio_printf(iocom, 4,
1980                                     "kdmsg_msg_write: Write to dying circuit "
1981                                     "state=%p "
1982                                     "ptxcmd=%08x prxcmd=%08x flags=%08x\n",
1983                                     state,
1984                                     state->parent->rxcmd,
1985                                     state->parent->txcmd,
1986                                     state->parent->flags);
1987                         kdmsg_state_hold(state);
1988                         kdmsg_state_msgtx(msg);
1989                         kdmsg_state_cleanuptx(msg);
1990                         kdmsg_state_drop(state);
1991                         return;
1992                 }
1993         }
1994
1995         /*
1996          * Finish up the msg fields.  Note that msg->aux_size and the
1997          * aux_bytes stored in the message header represent the unaligned
1998          * (actual) bytes of data, but the buffer is sized to an aligned
1999          * size and the CRC is generated over the aligned length.
2000          */
2001         msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
2002         ++iocom->msg_seq;
2003
2004         if (msg->aux_data && msg->aux_size) {
2005                 uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
2006
2007                 msg->any.head.aux_bytes = msg->aux_size;
2008                 msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
2009         }
2010         msg->any.head.hdr_crc = 0;
2011         msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
2012
2013         TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
2014
2015         if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
2016                 atomic_clear_int(&iocom->msg_ctl,
2017                                  KDMSG_CLUSTERCTL_SLEEPING);
2018                 wakeup(&iocom->msg_ctl);
2019         }
2020 }
2021
2022 /*
2023  * Reply to a message and terminate our side of the transaction.
2024  *
2025  * If msg->state is non-NULL we are replying to a one-way message.
2026  */
2027 void
2028 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
2029 {
2030         kdmsg_state_t *state = msg->state;
2031         kdmsg_msg_t *nmsg;
2032         uint32_t cmd;
2033
2034         /*
2035          * Reply with a simple error code and terminate the transaction.
2036          */
2037         cmd = DMSG_LNK_ERROR;
2038
2039         /*
2040          * Check if our direction has even been initiated yet, set CREATE.
2041          *
2042          * Check what direction this is (command or reply direction).  Note
2043          * that txcmd might not have been initiated yet.
2044          *
2045          * If our direction has already been closed we just return without
2046          * doing anything.
2047          */
2048         if (state != &state->iocom->state0) {
2049                 if (state->txcmd & DMSGF_DELETE)
2050                         return;
2051                 if ((state->txcmd & DMSGF_CREATE) == 0)
2052                         cmd |= DMSGF_CREATE;
2053                 if (state->txcmd & DMSGF_REPLY)
2054                         cmd |= DMSGF_REPLY;
2055                 cmd |= DMSGF_DELETE;
2056         } else {
2057                 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2058                         cmd |= DMSGF_REPLY;
2059         }
2060
2061         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2062         nmsg->any.head.error = error;
2063         kdmsg_msg_write(nmsg);
2064 }
2065
2066 /*
2067  * Reply to a message and continue our side of the transaction.
2068  *
2069  * If msg->state is non-NULL we are replying to a one-way message and this
2070  * function degenerates into the same as kdmsg_msg_reply().
2071  */
2072 void
2073 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
2074 {
2075         kdmsg_state_t *state = msg->state;
2076         kdmsg_msg_t *nmsg;
2077         uint32_t cmd;
2078
2079         /*
2080          * Return a simple result code, do NOT terminate the transaction.
2081          */
2082         cmd = DMSG_LNK_ERROR;
2083
2084         /*
2085          * Check if our direction has even been initiated yet, set CREATE.
2086          *
2087          * Check what direction this is (command or reply direction).  Note
2088          * that txcmd might not have been initiated yet.
2089          *
2090          * If our direction has already been closed we just return without
2091          * doing anything.
2092          */
2093         if (state != &state->iocom->state0) {
2094                 if (state->txcmd & DMSGF_DELETE)
2095                         return;
2096                 if ((state->txcmd & DMSGF_CREATE) == 0)
2097                         cmd |= DMSGF_CREATE;
2098                 if (state->txcmd & DMSGF_REPLY)
2099                         cmd |= DMSGF_REPLY;
2100                 /* continuing transaction, do not set MSGF_DELETE */
2101         } else {
2102                 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2103                         cmd |= DMSGF_REPLY;
2104         }
2105
2106         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2107         nmsg->any.head.error = error;
2108         kdmsg_msg_write(nmsg);
2109 }
2110
2111 /*
2112  * Reply to a message and terminate our side of the transaction.
2113  *
2114  * If msg->state is non-NULL we are replying to a one-way message.
2115  */
2116 void
2117 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
2118 {
2119         kdmsg_msg_t *nmsg;
2120         uint32_t cmd;
2121
2122         /*
2123          * Reply with a simple error code and terminate the transaction.
2124          */
2125         cmd = DMSG_LNK_ERROR;
2126
2127         /*
2128          * Check if our direction has even been initiated yet, set CREATE.
2129          *
2130          * Check what direction this is (command or reply direction).  Note
2131          * that txcmd might not have been initiated yet.
2132          *
2133          * If our direction has already been closed we just return without
2134          * doing anything.
2135          */
2136         KKASSERT(state);
2137         if (state->txcmd & DMSGF_DELETE)
2138                 return;
2139         if ((state->txcmd & DMSGF_CREATE) == 0)
2140                 cmd |= DMSGF_CREATE;
2141         if (state->txcmd & DMSGF_REPLY)
2142                 cmd |= DMSGF_REPLY;
2143         cmd |= DMSGF_DELETE;
2144
2145         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2146         nmsg->any.head.error = error;
2147         kdmsg_msg_write(nmsg);
2148 }
2149
2150 /*
2151  * Reply to a message and continue our side of the transaction.
2152  *
2153  * If msg->state is non-NULL we are replying to a one-way message and this
2154  * function degenerates into the same as kdmsg_msg_reply().
2155  */
2156 void
2157 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
2158 {
2159         kdmsg_msg_t *nmsg;
2160         uint32_t cmd;
2161
2162         /*
2163          * Return a simple result code, do NOT terminate the transaction.
2164          */
2165         cmd = DMSG_LNK_ERROR;
2166
2167         /*
2168          * Check if our direction has even been initiated yet, set CREATE.
2169          *
2170          * Check what direction this is (command or reply direction).  Note
2171          * that txcmd might not have been initiated yet.
2172          *
2173          * If our direction has already been closed we just return without
2174          * doing anything.
2175          */
2176         KKASSERT(state);
2177         if (state->txcmd & DMSGF_DELETE)
2178                 return;
2179         if ((state->txcmd & DMSGF_CREATE) == 0)
2180                 cmd |= DMSGF_CREATE;
2181         if (state->txcmd & DMSGF_REPLY)
2182                 cmd |= DMSGF_REPLY;
2183         /* continuing transaction, do not set MSGF_DELETE */
2184
2185         nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2186         nmsg->any.head.error = error;
2187         kdmsg_msg_write(nmsg);
2188 }