2 * Copyright (c) 2012 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * TODO: txcmd CREATE state is deferred by tx msgq, need to calculate
36 * a streaming response. See subr_diskiocom()'s diskiodone().
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/sysctl.h>
53 #include <sys/thread.h>
54 #include <sys/globaldata.h>
55 #include <sys/limits.h>
59 RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
61 SYSCTL_NODE(, OID_AUTO, kdmsg, CTLFLAG_RW, 0, "kdmsg");
62 static int kdmsg_debug = 1;
63 SYSCTL_INT(_kdmsg, OID_AUTO, debug, CTLFLAG_RW, &kdmsg_debug, 0,
64 "Set debug level for kernel dmsg layer");
/*
 * Debug printf helpers.  A message is passed to kprintf() with a "kdmsg: "
 * prefix when the kdmsg_debug sysctl value is at least (level).
 *
 * Each body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement.  The previous bare "if (...) kprintf(...)" expansion would
 * capture a following "else" whenever the macro was used as the body of an
 * unbraced if/else, silently changing which branch the else belonged to.
 *
 * NOTE: ctl must be a string literal (it is concatenated onto the prefix)
 *	 and at least one variadic argument must be supplied.
 */
#define kd_printf(level, ctl, ...)					\
	do {								\
		if (kdmsg_debug >= (level))				\
			kprintf("kdmsg: " ctl, __VA_ARGS__);		\
	} while (0)

/*
 * Same as kd_printf().  The iocom argument is accepted for the caller's
 * convenience but is not referenced by the expansion.
 */
#define kdio_printf(iocom, level, ctl, ...)				\
	do {								\
		if (kdmsg_debug >= (level))				\
			kprintf("kdmsg: " ctl, __VA_ARGS__);		\
	} while (0)
72 static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
73 static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
74 static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
75 static void kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg);
76 static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
77 static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
78 static void kdmsg_subq_delete(kdmsg_state_t *state);
79 static void kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error);
80 static void kdmsg_state_abort(kdmsg_state_t *state);
81 static void kdmsg_state_dying(kdmsg_state_t *state);
82 static void kdmsg_state_free(kdmsg_state_t *state);
85 #define KDMSG_DEBUG_ARGS , const char *file, int line
86 #define kdmsg_state_hold(state) _kdmsg_state_hold(state, __FILE__, __LINE__)
87 #define kdmsg_state_drop(state) _kdmsg_state_drop(state, __FILE__, __LINE__)
90 #define KDMSG_DEBUG_ARGS
91 #define kdmsg_state_hold(state) _kdmsg_state_hold(state)
92 #define kdmsg_state_drop(state) _kdmsg_state_drop(state)
94 static void _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
95 static void _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
97 static void kdmsg_iocom_thread_rd(void *arg);
98 static void kdmsg_iocom_thread_wr(void *arg);
99 static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
101 /*static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);*/
104 * Initialize the roll-up communications structure for a network
105 * messaging session. This function does not install the socket.
/*
 * kdmsg_iocom_init() - put an iocom into its initial, unconnected state.
 *
 * iocom	structure to initialize; it is zeroed first, so any previous
 *		contents are discarded.
 * handle	opaque caller cookie, saved in iocom->handle.
 * flags	initial flag bits, saved in iocom->flags (tested elsewhere in
 *		this file against the KDMSG_IOCOMF_* constants).
 * mmsg		malloc type for this iocom; iocom->mmsg is what the receive
 *		path hands to kmalloc().
 * rcvmsg	receive callback, saved in iocom->rcvmsg.
 */
108 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
109 struct malloc_type *mmsg,
110 int (*rcvmsg)(kdmsg_msg_t *msg))
112 bzero(iocom, sizeof(*iocom));
113 iocom->handle = handle;
115 iocom->rcvmsg = rcvmsg;
116 iocom->flags = flags;
/* msglk guards the transmit queue and the two state trees set up below. */
117 lockinit(&iocom->msglk, "h2msg", 0, 0);
118 TAILQ_INIT(&iocom->msgq);
119 RB_INIT(&iocom->staterd_tree);
120 RB_INIT(&iocom->statewr_tree);
/*
 * state0 is embedded in the iocom and is its own parent; other states
 * are linked onto a parent's subq, with state0 used when a message names
 * no circuit.
 */
122 iocom->state0.iocom = iocom;
123 iocom->state0.parent = &iocom->state0;
124 TAILQ_INIT(&iocom->state0.subq);
128 * [Re]connect using the passed file pointer. The caller must ref the
129 * fp for us. We own that ref now.
/*
 * kdmsg_iocom_reconnect() - stop any running reader/writer threads, drop
 * the old descriptor and start a fresh pair of threads.
 *
 * iocom	iocom to (re)connect.
 * fp		new communications descriptor.
 * subsysname	prefix used to name the two threads
 *		("<subsysname>-msgrd" and "<subsysname>-msgwr").
 *
 * msglk is taken exclusively at entry and released after both threads
 * have been created.
 */
132 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
133 const char *subsysname)
136 * Destroy the current connection
138 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
/* KILLRX is the exit condition of the reader thread's main loop. */
139 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
/*
 * Each thread clears its own td pointer when it exits.  Keep waking
 * sleepers on msg_ctl and re-checking, with an hz-tick sleep timeout,
 * until both pointers are NULL.
 */
140 while (iocom->msgrd_td || iocom->msgwr_td) {
141 wakeup(&iocom->msg_ctl);
142 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
146 * Drop communications descriptor
149 fdrop(iocom->msg_fp);
150 iocom->msg_fp = NULL;
154 * Setup new communications descriptor
/* EXITNOACC is set by the writer thread on its way out; clear it. */
159 iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
161 lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
162 NULL, 0, -1, "%s-msgrd", subsysname);
163 lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
164 NULL, 0, -1, "%s-msgwr", subsysname);
165 lockmgr(&iocom->msglk, LK_RELEASE);
169 * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
170 * this function to handle the state machine for LNK_CONN and LNK_SPAN.
172 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
173 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
/*
 * kdmsg_iocom_autoinitiate() - open the LNK_CONN transaction for an iocom.
 *
 * iocom		iocom whose auto_lnk_conn template has been filled in
 *			by the caller.
 * auto_callback	saved in iocom->auto_callback; invoked from the
 *			LNK_CONN / LNK_SPAN handlers in this file.
 *
 * Replies on the transaction are delivered to kdmsg_lnk_conn_reply().
 */
176 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
177 void (*auto_callback)(kdmsg_msg_t *msg))
181 iocom->auto_callback = auto_callback;
183 msg = kdmsg_msg_alloc(&iocom->state0,
184 DMSG_LNK_CONN | DMSGF_CREATE,
185 kdmsg_lnk_conn_reply, NULL);
/*
 * Copy the header of the freshly allocated message into the template
 * first, then copy the whole template into the message.  Presumably
 * lnk_conn overlays head inside msg->any, so this ordering keeps the
 * allocated header intact -- confirm against the dmsg type definitions.
 */
186 iocom->auto_lnk_conn.head = msg->any.head;
187 msg->any.lnk_conn = iocom->auto_lnk_conn;
/* conn_state keeps its own reference; dropped in kdmsg_lnk_conn_reply(). */
188 iocom->conn_state = msg->state;
189 kdmsg_state_hold(msg->state); /* iocom->conn_state */
190 kdmsg_msg_write(msg);
/*
 * kdmsg_lnk_conn_reply() - state callback for the LNK_CONN transaction
 * opened by kdmsg_iocom_autoinitiate().
 *
 * state	the LNK_CONN transaction state.
 * msg		message received on that transaction.
 *
 * Three steps, in order: optionally open the automatic LNK_SPAN, run the
 * auto_callback shim, then handle a DELETE from the other side.
 */
195 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
197 kdmsg_iocom_t *iocom = state->iocom;
201 * Upon receipt of the LNK_CONN acknowledgement initiate an
202 * automatic SPAN if we were asked to. Used by e.g. xdisk, but
203 * not used by HAMMER2 which must manage more than one transmitted
206 if ((msg->any.head.cmd & DMSGF_CREATE) &&
207 (iocom->flags & KDMSG_IOCOMF_AUTOTXSPAN)) {
/* Same header-then-template copy sequence as used for LNK_CONN. */
208 rmsg = kdmsg_msg_alloc(&iocom->state0,
209 DMSG_LNK_SPAN | DMSGF_CREATE,
210 kdmsg_lnk_span_reply, NULL);
211 iocom->auto_lnk_span.head = rmsg->any.head;
212 rmsg->any.lnk_span = iocom->auto_lnk_span;
213 kdmsg_msg_write(rmsg);
217 * Process shim after the CONN is acknowledged and before the CONN
218 * transaction is deleted. For deletions this gives device drivers
219 * the ability to interlock new operations on the circuit before
220 * it becomes illegal and panics.
222 if (iocom->auto_callback)
223 iocom->auto_callback(msg);
/*
 * The received message carries DELETE and our transmit side has not
 * sent DELETE yet: release the conn_state reference and reply.
 */
225 if ((state->txcmd & DMSGF_DELETE) == 0 &&
226 (msg->any.head.cmd & DMSGF_DELETE)) {
228 * iocom->conn_state has a state ref, drop it when clearing.
230 if (iocom->conn_state)
231 kdmsg_state_drop(iocom->conn_state);
232 iocom->conn_state = NULL;
233 kdmsg_msg_reply(msg, 0);
/*
 * kdmsg_lnk_span_reply() - state callback for the automatic LNK_SPAN
 * transaction opened from kdmsg_lnk_conn_reply().
 *
 * state	the LNK_SPAN transaction state.
 * msg		message received on that transaction.
 *
 * Runs the auto_callback shim first, then replies when the received
 * message carries DELETE and our transmit side has not sent DELETE yet.
 */
241 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
244 * Be sure to process shim before terminating the SPAN
245 * transaction. Gives device drivers the ability to
246 * interlock new operations on the circuit before it
247 * becomes illegal and panics.
249 if (state->iocom->auto_callback)
250 state->iocom->auto_callback(msg);
252 if ((state->txcmd & DMSGF_DELETE) == 0 &&
253 (msg->any.head.cmd & DMSGF_DELETE)) {
254 kdmsg_msg_reply(msg, 0);
260 * Disconnect and clean up
/*
 * kdmsg_iocom_uninit() - stop the reader/writer threads and release the
 * iocom's cached states and descriptor.
 *
 * iocom	iocom to shut down.
 *
 * msglk is taken exclusively at entry and released at the end; the
 * lksleep() in the wait loop is passed msglk as its interlock.
 */
263 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
265 kdmsg_state_t *state;
270 * Ask the cluster controller to go away by setting
271 * KILLRX. Send a PING to get a response to unstick reading
274 * After 10 seconds shitcan the pipe and do an unclean shutdown.
276 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
278 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
/* msglk is held here, so the _locked variant of the write is used. */
279 msg = kdmsg_msg_alloc(&iocom->state0, DMSG_LNK_PING, NULL, NULL);
280 kdmsg_msg_write_locked(iocom, msg);
/*
 * Poll until both threads have cleared their td pointers.  When the
 * retry counter reaches zero the descriptor is shut down in both
 * directions; the loop itself keeps going until the threads are gone.
 */
283 while (iocom->msgrd_td || iocom->msgwr_td) {
284 wakeup(&iocom->msg_ctl);
285 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
286 if (--retries == 0 && iocom->msg_fp) {
287 kdio_printf(iocom, 0, "%s\n",
289 "shitcanning unresponsive pipe");
290 fp_shutdown(iocom->msg_fp, SHUT_RDWR);
291 /* retries allowed to go negative, keep looping */
/* Release the cached spare state structures, if any. */
298 if ((state = iocom->freerd_state) != NULL) {
299 iocom->freerd_state = NULL;
300 kdmsg_state_drop(state);
303 if ((state = iocom->freewr_state) != NULL) {
304 iocom->freewr_state = NULL;
305 kdmsg_state_drop(state);
309 * Drop communications descriptor
312 fdrop(iocom->msg_fp);
313 iocom->msg_fp = NULL;
315 lockmgr(&iocom->msglk, LK_RELEASE);
319 * Cluster controller thread. Perform messaging functions. We have one
320 * thread for the reader and one for the writer. The writer handles
321 * shutdown requests (which should break the reader thread).
/*
 * kdmsg_iocom_thread_rd() - body of the reader thread.
 *
 * arg	the kdmsg_iocom_t for this connection.
 *
 * Loops while KDMSG_CLUSTERCTL_KILLRX is clear in iocom->msg_ctl.  Per
 * message: read the fixed header, validate magic and header size,
 * allocate a kdmsg_msg, read any extended header and the aux payload,
 * then pass the message to kdmsg_msg_receive_handling().
 *
 * On the way out the descriptor is shut down, KILLRX and KILLTX are both
 * set, msgrd_td is cleared under msglk and sleepers on msg_ctl are woken.
 */
325 kdmsg_iocom_thread_rd(void *arg)
327 kdmsg_iocom_t *iocom = arg;
329 kdmsg_msg_t *msg = NULL;
334 while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLRX) == 0) {
336 * Retrieve the message from the pipe or socket.
338 error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
339 NULL, 1, UIO_SYSSPACE);
342 if (hdr.magic != DMSG_HDR_MAGIC) {
343 kdio_printf(iocom, 1, "bad magic: %04x\n", hdr.magic);
/*
 * The total header size is encoded in the DMSGF_SIZE bits of cmd, in
 * units of DMSG_ALIGN bytes.  It must cover at least the fixed header
 * and must not exceed DMSG_HDR_MAX.
 */
347 hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
348 if (hbytes < sizeof(hdr) || hbytes > DMSG_HDR_MAX) {
349 kdio_printf(iocom, 1, "bad header size %zd\n", hbytes);
354 /* XXX messy: mask cmd to avoid allocating state */
355 msg = kdmsg_msg_alloc(&iocom->state0,
356 hdr.cmd & DMSGF_BASECMDMASK,
359 msg->hdr_size = hbytes;
/* Extended header bytes land directly after the fixed head in msg->any. */
360 if (hbytes > sizeof(hdr)) {
361 error = fp_read(iocom->msg_fp, &msg->any.head + 1,
362 hbytes - sizeof(hdr),
363 NULL, 1, UIO_SYSSPACE);
365 kdio_printf(iocom, 1, "%s\n",
366 "short msg received");
371 msg->aux_size = hdr.aux_bytes;
372 if (msg->aux_size > DMSG_AUX_MAX) {
373 kdio_printf(iocom, 1,
374 "illegal msg payload size %zd\n",
/*
 * The payload buffer is allocated and read at the DMSG_DOALIGN()ed
 * length rather than at aux_size itself.  KDMSG_FLAG_AUXALLOC records
 * that aux_data was kmalloc()ed here.
 */
380 abytes = DMSG_DOALIGN(msg->aux_size);
381 msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
382 msg->flags |= KDMSG_FLAG_AUXALLOC;
383 error = fp_read(iocom->msg_fp, msg->aux_data,
384 abytes, NULL, 1, UIO_SYSSPACE);
386 kdio_printf(iocom, 1, "%s\n",
387 "short msg payload received");
392 error = kdmsg_msg_receive_handling(msg);
397 kdio_printf(iocom, 1, "read thread terminating error=%d\n", error);
400 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
405 * Shutdown the socket and set KILLRX for consistency in case the
406 * shutdown was not commanded. Signal the transmit side to shutdown
407 * by setting KILLTX and waking it up.
409 fp_shutdown(iocom->msg_fp, SHUT_RDWR);
410 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
411 KDMSG_CLUSTERCTL_KILLTX);
412 iocom->msgrd_td = NULL;
413 lockmgr(&iocom->msglk, LK_RELEASE);
414 wakeup(&iocom->msg_ctl);
417 * iocom can be ripped out at any time once the lock is
418 * released with msgrd_td set to NULL. The wakeup()s are safe but
427 kdmsg_iocom_thread_wr(void *arg)
429 kdmsg_iocom_t *iocom = arg;
441 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
443 while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0 && error == 0) {
445 * Sleep if no messages pending. Interlock with flag while
448 if (TAILQ_EMPTY(&iocom->msgq)) {
449 atomic_set_int(&iocom->msg_ctl,
450 KDMSG_CLUSTERCTL_SLEEPING);
451 lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
452 atomic_clear_int(&iocom->msg_ctl,
453 KDMSG_CLUSTERCTL_SLEEPING);
456 while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
458 * Remove msg from the transmit queue and do
459 * persist and half-closed state handling.
461 TAILQ_REMOVE(&iocom->msgq, msg, qentry);
463 error = kdmsg_state_msgtx(msg);
464 if (error == EALREADY) {
475 * Dump the message to the pipe or socket.
477 * We have to clean up the message as if the transmit
478 * succeeded even if it failed.
480 lockmgr(&iocom->msglk, LK_RELEASE);
481 error = fp_write(iocom->msg_fp, &msg->any,
482 msg->hdr_size, &res, UIO_SYSSPACE);
483 if (error || res != msg->hdr_size) {
486 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
487 kdmsg_state_cleanuptx(msg);
491 abytes = DMSG_DOALIGN(msg->aux_size);
492 error = fp_write(iocom->msg_fp,
493 msg->aux_data, abytes,
495 if (error || res != abytes) {
498 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
499 kdmsg_state_cleanuptx(msg);
503 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
504 kdmsg_state_cleanuptx(msg);
509 kdio_printf(iocom, 1, "write thread terminating error=%d\n", error);
513 * Shutdown the socket and set KILLTX for consistency in case the
514 * shutdown was not commanded. Signal the receive side to shutdown
515 * by setting KILLRX and waking it up.
517 fp_shutdown(iocom->msg_fp, SHUT_RDWR);
518 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
519 KDMSG_CLUSTERCTL_KILLTX);
520 wakeup(&iocom->msg_ctl);
523 * The transmit thread is responsible for final cleanups, wait
524 * for the receive side to terminate to prevent new received
525 * states from interfering with our cleanup.
527 * Do not set msgwr_td to NULL until we actually exit.
529 while (iocom->msgrd_td) {
530 wakeup(&iocom->msg_ctl);
531 lksleep(iocom, &iocom->msglk, 0, "clstrkt", hz);
535 * We can no longer receive new messages. We must drain the transmit
536 * message queue and simulate received messages to close any remaining
539 * Loop until all the states are gone and there are no messages
545 while (TAILQ_FIRST(&iocom->msgq) ||
546 RB_ROOT(&iocom->staterd_tree) ||
547 RB_ROOT(&iocom->statewr_tree)) {
549 * Simulate failure for all sub-states of state0.
551 kdmsg_drain_msgq(iocom);
552 kdio_printf(iocom, 2, "%s\n",
553 "simulate failure for all substates of state0");
554 kdmsg_simulate_failure(&iocom->state0, 0, DMSG_ERR_LOSTLINK);
556 lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz / 2);
558 if ((int)(ticks - save_ticks) > hz*2 && didwarn == 0) {
560 kdio_printf(iocom, 0,
561 "Warning, write thread on %p "
562 "still terminating\n",
565 if ((int)(ticks - save_ticks) > hz*15 && didwarn == 1) {
567 kdio_printf(iocom, 0,
568 "Warning, write thread on %p "
569 "still terminating\n",
572 if ((int)(ticks - save_ticks) > hz*60) {
573 kdio_printf(iocom, 0,
574 "Can't terminate: msgq %p "
575 "rd_tree %p wr_tree %p\n",
576 TAILQ_FIRST(&iocom->msgq),
577 RB_ROOT(&iocom->staterd_tree),
578 RB_ROOT(&iocom->statewr_tree));
579 lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz * 10);
584 * Exit handling is done by the write thread.
586 iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
587 lockmgr(&iocom->msglk, LK_RELEASE);
590 * The state trees had better be empty now
592 KKASSERT(RB_EMPTY(&iocom->staterd_tree));
593 KKASSERT(RB_EMPTY(&iocom->statewr_tree));
594 KKASSERT(iocom->conn_state == NULL);
596 if (iocom->exit_func) {
598 * iocom is invalid after we call the exit function.
600 iocom->msgwr_td = NULL;
601 iocom->exit_func(iocom);
604 * iocom can be ripped out from under us once msgwr_td is
605 * set to NULL. The wakeup is safe.
607 iocom->msgwr_td = NULL;
614 * This cleans out the pending transmit message queue, adjusting any
615 * persistent states properly in the process.
617 * Called with iocom locked.
/*
 * kdmsg_drain_msgq() - empty iocom->msgq without transmitting anything.
 *
 * iocom	iocom whose transmit queue is drained; the writer thread
 *		calls this with msglk held.
 *
 * Each message is unlinked from the queue and run through
 * kdmsg_state_msgtx(); kdmsg_state_cleanuptx() then disposes of it.
 * NOTE(review): what happens when kdmsg_state_msgtx() returns non-zero
 * should be confirmed against the statement guarded by that test.
 */
620 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
625 * Clean out our pending transmit queue, executing the
626 * appropriate state adjustments. If this tries to open
627 * any new outgoing transactions we have to loop up and
630 while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
631 TAILQ_REMOVE(&iocom->msgq, msg, qentry);
632 if (kdmsg_state_msgtx(msg))
635 kdmsg_state_cleanuptx(msg);
640 * Do all processing required to handle a freshly received message
641 * after its low level header has been validated.
643 * iocom is not locked.
/*
 * kdmsg_msg_receive_handling() - run state tracking on a received message
 * and dispatch it to exactly one handler.
 *
 * msg	message built by the reader thread; msg->state supplies the iocom.
 *
 * After kdmsg_state_msgrx() the message goes down one of four paths:
 *   1. the error path (no handler is called),
 *   2. msg->state->func, when the state has a handler installed,
 *   3. kdmsg_autorxmsg(), when any KDMSG_IOCOMF_AUTOANY bit is set,
 *   4. iocom->rcvmsg() otherwise.
 * Paths 2-4 are each followed by kdmsg_state_cleanuprx(msg).
 *
 * The kdio_printf() calls guarded by KDMSG_STATE_ABORTING are level-5
 * debug traces only and do not affect dispatch.
 */
647 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
649 kdmsg_iocom_t *iocom = msg->state->iocom;
653 * State machine tracking, state assignment for msg,
654 * returns error and discard status. Errors are fatal
655 * to the connection except for EALREADY which forces
656 * a discard without execution.
/* kdmsg_state_msgrx() may replace msg->state; re-read it from here on. */
658 error = kdmsg_state_msgrx(msg);
659 if (msg->state->flags & KDMSG_STATE_ABORTING) {
660 kdio_printf(iocom, 5,
661 "kdmsg_state_abort(b): state %p rxcmd=%08x "
662 "txcmd=%08x msgrx error %d\n",
663 msg->state, msg->state->rxcmd,
664 msg->state->txcmd, error);
668 * Raw protocol or connection error
670 if (msg->state->flags & KDMSG_STATE_ABORTING)
671 kdio_printf(iocom, 5,
672 "X1 state %p error %d\n",
675 if (error == EALREADY)
677 } else if (msg->state && msg->state->func) {
679 * Message related to state which already has a
680 * handling function installed for it.
682 if (msg->state->flags & KDMSG_STATE_ABORTING)
683 kdio_printf(iocom, 5,
684 "X2 state %p func %p\n",
685 msg->state, msg->state->func);
686 error = msg->state->func(msg->state, msg);
687 kdmsg_state_cleanuprx(msg);
688 } else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
689 if (msg->state->flags & KDMSG_STATE_ABORTING)
690 kdio_printf(iocom, 5,
691 "X3 state %p\n", msg->state);
692 error = kdmsg_autorxmsg(msg);
693 kdmsg_state_cleanuprx(msg);
695 if (msg->state->flags & KDMSG_STATE_ABORTING)
696 kdio_printf(iocom, 5,
697 "X4 state %p\n", msg->state);
698 error = iocom->rcvmsg(msg);
699 kdmsg_state_cleanuprx(msg);
705 * Process state tracking for a message after reception and dequeueing,
706 * prior to execution of the state callback. The state is updated and
707 * will be removed from the RBTREE if completely closed, but the state->parent
708 * and subq linkage is not cleaned up until after the callback (see
713 * NOTE: A message transaction can consist of several messages in either
716 * NOTE: The msgid is unique to the initiator, not necessarily unique for
717 * us or for any relay or for the return direction for that matter.
718 * That is, two sides sending a new message can use the same msgid
723 * ABORT sequences work by setting the ABORT flag along with normal message
724 * state. However, ABORTs can also be sent on half-closed messages, that is
725 * even if the command or reply side has already sent a DELETE, as long as
726 * the message has not been fully closed it can still send an ABORT+DELETE
727 * to terminate the half-closed message state.
729 * Since ABORT+DELETEs can race we silently discard ABORT's for message
730 * state which has already been fully closed. REPLY+ABORT+DELETEs can
731 * also race, and in this situation the other side might have already
732 * initiated a new unrelated command with the same message id. Since
733 * the abort has not set the CREATE flag the situation can be detected
734 * and the message will also be discarded.
736 * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
737 * The ABORT request is essentially integrated into the command instead
738 * of being sent later on. In this situation the command implementation
739 * detects that CREATE and ABORT are both set (vs ABORT alone) and can
740 * special-case non-blocking operation for the command.
742 * NOTE! Messages with ABORT set without CREATE or DELETE are considered
743 * to be mid-stream aborts for command/reply sequences. ABORTs on
744 * one-way messages are not supported.
746 * NOTE! If a command sequence does not support aborts the ABORT flag is
751 * One-off messages (no reply expected) are sent with neither CREATE nor DELETE
752 * set. One-off messages cannot be aborted and typically aren't processed
753 * by these routines. The REPLY bit can be used to distinguish whether a
754 * one-off message is a command or reply. For example, one-off replies
755 * will typically just contain status updates.
759 kdmsg_state_msgrx(kdmsg_msg_t *msg)
761 kdmsg_iocom_t *iocom = msg->state->iocom;
762 kdmsg_state_t *state;
763 kdmsg_state_t *pstate;
764 kdmsg_state_t sdummy;
767 bzero(&sdummy, sizeof(sdummy)); /* avoid gcc warnings */
770 * Make sure a state structure is ready to go in case we need a new
771 * one. This is the only routine which uses freerd_state so no
772 * races are possible.
774 if ((state = iocom->freerd_state) == NULL) {
775 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
776 state->flags = KDMSG_STATE_DYNAMIC;
777 state->iocom = iocom;
779 TAILQ_INIT(&state->subq);
780 iocom->freerd_state = state;
782 state = NULL; /* safety */
785 * Lock RB tree and locate existing persistent state, if any.
787 * If received msg is a command state is on staterd_tree.
788 * If received msg is a reply state is on statewr_tree.
790 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
793 if (msg->state == &iocom->state0) {
794 sdummy.msgid = msg->any.head.msgid;
795 sdummy.iocom = iocom;
796 if (msg->any.head.cmd & DMSGF_REVTRANS) {
797 state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree,
800 state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree,
805 * Set message state unconditionally. If this is a CREATE
806 * message this state will become the parent state and new
807 * state will be allocated for the message state.
810 state = &iocom->state0;
811 if (state->flags & KDMSG_STATE_INTERLOCK) {
812 state->flags |= KDMSG_STATE_SIGNAL;
813 lksleep(state, &iocom->msglk, 0, "dmrace", hz);
816 kdmsg_state_hold(state);
817 kdmsg_state_drop(msg->state); /* iocom->state0 */
824 * Short-cut one-off or mid-stream messages.
826 if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
827 DMSGF_ABORT)) == 0) {
833 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
834 * inside the case statements.
836 switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
838 case DMSGF_CREATE | DMSGF_DELETE:
840 * New persistent command received.
842 if (state != &iocom->state0) {
843 kdio_printf(iocom, 1, "%s\n",
844 "duplicate transaction");
850 * Lookup the circuit. The circuit is an open transaction.
851 * The REVCIRC bit in the message tells us which side
852 * initiated the transaction representing the circuit.
854 if (msg->any.head.circuit) {
855 sdummy.msgid = msg->any.head.circuit;
857 if (msg->any.head.cmd & DMSGF_REVCIRC) {
858 pstate = RB_FIND(kdmsg_state_tree,
859 &iocom->statewr_tree,
862 pstate = RB_FIND(kdmsg_state_tree,
863 &iocom->staterd_tree,
866 if (pstate == NULL) {
867 kdio_printf(iocom, 1, "%s\n",
874 pstate = &iocom->state0;
878 * Allocate new state.
880 * msg->state becomes the owner of the ref we inherit from
883 kdmsg_state_drop(state);
884 state = iocom->freerd_state;
885 iocom->freerd_state = NULL;
887 msg->state = state; /* inherits freerd ref */
888 state->parent = pstate;
889 KKASSERT(state->iocom == iocom);
890 state->flags |= KDMSG_STATE_RBINSERTED |
891 KDMSG_STATE_SUBINSERTED |
892 KDMSG_STATE_OPPOSITE;
893 if (TAILQ_EMPTY(&pstate->subq))
894 kdmsg_state_hold(pstate);/* states on pstate->subq */
895 kdmsg_state_hold(state); /* state on pstate->subq */
896 kdmsg_state_hold(state); /* state on rbtree */
897 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
898 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
899 state->txcmd = DMSGF_REPLY;
900 state->msgid = msg->any.head.msgid;
901 state->flags &= ~KDMSG_STATE_NEW;
902 RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
903 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
908 * Persistent state is expected but might not exist if an
909 * ABORT+DELETE races the close.
911 if (state == &iocom->state0) {
912 if (msg->any.head.cmd & DMSGF_ABORT) {
913 kdio_printf(iocom, 1, "%s\n",
918 kdio_printf(iocom, 1, "%s\n",
919 "msgrx: no state for DELETE");
926 * Handle another ABORT+DELETE case if the msgid has already
929 if ((state->rxcmd & DMSGF_CREATE) == 0) {
930 if (msg->any.head.cmd & DMSGF_ABORT) {
931 kdio_printf(iocom, 1, "%s\n",
932 "msgrx: state already B");
935 kdio_printf(iocom, 1, "%s\n",
936 "msgrx: state reused for DELETE");
945 * Check for mid-stream ABORT command received, otherwise
948 if (msg->any.head.cmd & DMSGF_ABORT) {
949 if (state == &iocom->state0 ||
950 (state->rxcmd & DMSGF_CREATE) == 0) {
957 case DMSGF_REPLY | DMSGF_CREATE:
958 case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
960 * When receiving a reply with CREATE set the original
961 * persistent state message should already exist.
963 if (state == &iocom->state0) {
964 kdio_printf(iocom, 1,
965 "msgrx: no state match for "
966 "REPLY cmd=%08x msgid=%016jx\n",
968 (intmax_t)msg->any.head.msgid);
972 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
975 case DMSGF_REPLY | DMSGF_DELETE:
977 * Received REPLY+ABORT+DELETE in case where msgid has
978 * already been fully closed, ignore the message.
980 if (state == &iocom->state0) {
981 if (msg->any.head.cmd & DMSGF_ABORT) {
984 kdio_printf(iocom, 1, "%s\n",
985 "msgrx: no state match "
993 * Received REPLY+ABORT+DELETE in case where msgid has
994 * already been reused for an unrelated message,
995 * ignore the message.
997 if ((state->rxcmd & DMSGF_CREATE) == 0) {
998 if (msg->any.head.cmd & DMSGF_ABORT) {
1001 kdio_printf(iocom, 1, "%s\n",
1002 "msgrx: state reused "
1003 "for REPLY|DELETE");
1012 * Check for mid-stream ABORT reply received to sent command.
1014 if (msg->any.head.cmd & DMSGF_ABORT) {
1015 if (state == &iocom->state0 ||
1016 (state->rxcmd & DMSGF_CREATE) == 0) {
1026 * Calculate the easy-switch() transactional command. Represents
1027 * the outer-transaction command for any transaction-create or
1028 * transaction-delete, and the inner message command for any
1029 * non-transaction or inside-transaction command. tcmd will be
1030 * set to 0 if the message state is illegal.
1032 * The two can be told apart because outer-transaction commands
1033 * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
1036 if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
1037 if (state != &iocom->state0) {
1038 msg->tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1039 (msg->any.head.cmd & (DMSGF_CREATE |
1046 msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
1050 * Adjust the state for DELETE handling now, before making the
1051 * callback so we are atomic with other state updates.
1053 * Subq/parent linkages are cleaned up after the callback.
1054 * If an error occurred the message is ignored and state is not
1057 if ((state = msg->state) == NULL || error != 0) {
1058 kdio_printf(iocom, 1,
1059 "msgrx: state=%p error %d\n",
1061 } else if (msg->any.head.cmd & DMSGF_DELETE) {
1062 KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1063 state->rxcmd |= DMSGF_DELETE;
1064 if (state->txcmd & DMSGF_DELETE) {
1065 KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1066 if (state->rxcmd & DMSGF_REPLY) {
1067 KKASSERT(msg->any.head.cmd &
1069 RB_REMOVE(kdmsg_state_tree,
1070 &iocom->statewr_tree, state);
1072 KKASSERT((msg->any.head.cmd &
1074 RB_REMOVE(kdmsg_state_tree,
1075 &iocom->staterd_tree, state);
1077 state->flags &= ~KDMSG_STATE_RBINSERTED;
1078 kdmsg_state_drop(state); /* state on rbtree */
1081 lockmgr(&iocom->msglk, LK_RELEASE);
1087 * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
1088 * This routine must call iocom->rcvmsg() for anything not automatically
1092 kdmsg_autorxmsg(kdmsg_msg_t *msg)
1094 kdmsg_iocom_t *iocom = msg->state->iocom;
1100 * Main switch processes transaction create/delete sequences only.
1101 * Use icmd (DELETEs use DMSG_LNK_ERROR
1103 * NOTE: If processing in-transaction messages you generally want
1104 * an inner switch on msg->any.head.cmd.
1107 cmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1108 (msg->any.head.cmd & (DMSGF_CREATE |
1118 * Received ping, send reply
1120 rep = kdmsg_msg_alloc(msg->state, DMSG_LNK_PING | DMSGF_REPLY,
1122 kdmsg_msg_write(rep);
1124 case DMSG_LNK_PING | DMSGF_REPLY:
1125 /* ignore replies */
1127 case DMSG_LNK_CONN | DMSGF_CREATE:
1128 case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
1130 * Received LNK_CONN transaction. Transmit response and
1131 * leave transaction open, which allows the other end to
1132 * start the SPAN protocol.
1134 * Handle shim after acknowledging the CONN.
1136 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1137 if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1138 kdmsg_msg_result(msg, 0);
1139 if (iocom->auto_callback)
1140 iocom->auto_callback(msg);
1142 error = iocom->rcvmsg(msg);
1147 case DMSG_LNK_CONN | DMSGF_DELETE:
1149 * This message is usually simulated after a link is lost
1150 * to clean up the transaction.
1152 if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1153 if (iocom->auto_callback)
1154 iocom->auto_callback(msg);
1155 kdmsg_msg_reply(msg, 0);
1157 error = iocom->rcvmsg(msg);
1160 case DMSG_LNK_SPAN | DMSGF_CREATE:
1161 case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1163 * Received LNK_SPAN transaction. We do not have to respond
1164 * (except on termination), but we must leave the transaction
1167 * Handle shim after acknowledging the SPAN.
1169 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1170 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1171 if (iocom->auto_callback)
1172 iocom->auto_callback(msg);
1177 error = iocom->rcvmsg(msg);
1181 case DMSG_LNK_SPAN | DMSGF_DELETE:
1183 * Process shims (auto_callback) before cleaning up the
1184 * circuit structure and closing the transactions. Device
1185 * driver should ensure that the circuit is not used after
1186 * the auto_callback() returns.
1188 * Handle shim before closing the SPAN transaction.
1190 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1191 if (iocom->auto_callback)
1192 iocom->auto_callback(msg);
1193 kdmsg_msg_reply(msg, 0);
1195 error = iocom->rcvmsg(msg);
1200 * Anything unhandled goes into rcvmsg.
1202 * NOTE: Replies to link-level messages initiated by our side
1203 * are handled by the state callback, they are NOT
1206 error = iocom->rcvmsg(msg);
1213 * Post-receive-handling message and state cleanup. This routine is called
1214 * after the state function handling/callback to properly dispose of the
1215 * message and unlink the state's parent/subq linkage if the state is
1216 * completely closed.
1218 * msglk is not held.
/*
 * kdmsg_state_cleanuprx() - dispose of a received message after its
 * handler has run and unlink its state from the parent when fully closed.
 *
 * msg	the received message; msg->state identifies the transaction.
 *
 * Takes msglk exclusively for the duration, so the caller must not
 * already hold it.
 */
1222 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1224 kdmsg_state_t *state = msg->state;
1225 kdmsg_iocom_t *iocom = state->iocom;
1227 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
/* state0 is the iocom's embedded root state; it has no linkage to clean. */
1228 if (state != &iocom->state0) {
1230 * When terminating a transaction (in either direction), all
1231 * sub-states are aborted.
1233 if ((msg->any.head.cmd & DMSGF_DELETE) &&
1234 TAILQ_FIRST(&msg->state->subq)) {
1235 kdio_printf(iocom, 2,
1236 "simulate failure for substates of "
1237 "state %p cmd %08x/%08x\n",
1241 kdmsg_simulate_failure(msg->state,
1242 0, DMSG_ERR_LOSTLINK);
1246 * Once the state is fully closed we can (try to) remove it
1247 * from the subq topology.
/* "Fully closed" means DELETE has been seen in both rxcmd and txcmd. */
1249 if ((state->flags & KDMSG_STATE_SUBINSERTED) &&
1250 (state->rxcmd & DMSGF_DELETE) &&
1251 (state->txcmd & DMSGF_DELETE)) {
1253 * Remove parent linkage if state is completely closed.
1255 kdmsg_subq_delete(state);
1258 kdmsg_msg_free(msg);
1260 lockmgr(&iocom->msglk, LK_RELEASE);
1264 * Remove state from its parent's subq. This can wind up recursively
1265 * dropping the parent upward.
1267 * NOTE: Once we drop the parent, our pstate pointer may become invalid.
/*
 * kdmsg_subq_delete() - unlink a state from its parent's subq.
 *
 * state	state to unlink.  Nothing is unlinked unless
 *		KDMSG_STATE_SUBINSERTED is set.
 *
 * The visible caller, kdmsg_state_cleanuprx(), calls this with msglk held.
 */
1271 kdmsg_subq_delete(kdmsg_state_t *state)
1273 kdmsg_state_t *pstate;
1275 if (state->flags & KDMSG_STATE_SUBINSERTED) {
1276 pstate = state->parent;
/*
 * pstate->scan is the cursor kdmsg_simulate_failure() sets while it
 * walks the subq.  Clearing it lets that walker notice that the element
 * it was visiting has been removed.
 */
1278 if (pstate->scan == state)
1279 pstate->scan = NULL;
1280 TAILQ_REMOVE(&pstate->subq, state, entry);
1281 state->flags &= ~KDMSG_STATE_SUBINSERTED;
1282 state->parent = NULL;
/*
 * The drops below mirror the holds taken at subq insertion time: one on
 * the parent while its subq is non-empty, and one on each child for its
 * membership in the subq.
 */
1283 if (TAILQ_EMPTY(&pstate->subq)) {
1284 kdmsg_state_drop(pstate);/* pstate->subq */
1286 pstate = NULL; /* safety */
1287 kdmsg_state_drop(state); /* pstate->subq */
1289 KKASSERT(state->parent == NULL);
1294 * Simulate receiving a message which terminates an active transaction
1295 * state. Our simulated received message must set DELETE and may also
1296 * have to set CREATE. It must also ensure that all fields are set such
1297 * that the receive handling code can find the state (kdmsg_state_msgrx())
1298 * or an endless loop will ensue.
1300 * This is used when the other end of the link is dead so the device driver
1301 * gets a completed transaction for all pending states.
1303 * Called with iocom locked.
/*
 * kdmsg_simulate_failure() - abort a state and, recursively, the states
 * queued on its subq.
 *
 * state - state to process; a reference is held across the whole call
 *         and dropped at the end.
 * meto  - kdmsg_state_cleanuprx() passes 0 and the recursion below
 *         passes 1; presumably selects whether the state itself is
 *         aborted in addition to its sub-states -- TODO confirm.
 * error - forwarded unchanged to the recursive calls.
 *
 * kdmsg_state_abort() on the state itself precedes the walk of
 * state->subq.  Sub-states already flagged KDMSG_STATE_ABORTING are
 * tested for before recursing.  state->scan records the sub-state being
 * processed; kdmsg_subq_delete() clears it when that sub-state is
 * unlinked, which is what the state->scan != substate test detects.
 */
1307 kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error)
1309 kdmsg_state_t *substate;
1311 kdmsg_state_hold(state); /* aborting */
1314 * Abort parent state first. Parent will not actually disappear
1315 * until children are gone. Device drivers must handle the situation.
1316 * The advantage of this is that device drivers can flag the situation
1317 * as an interlock against new operations on dying states. And since
1318 * device operations are often asynchronous anyway, this sequence of
1319 * events works out better.
1322 kdmsg_state_abort(state);
1325 * Recurse through any children.
1328 TAILQ_FOREACH(substate, &state->subq, entry) {
1329 if (substate->flags & KDMSG_STATE_ABORTING)
1331 state->scan = substate;
1332 kdmsg_simulate_failure(substate, 1, error);
1333 if (state->scan != substate)
1336 kdmsg_state_drop(state); /* aborting */
/*
 * kdmsg_state_abort() - flag a state as aborting and feed a simulated
 * DMSG_LNK_ERROR termination into the receive path.
 *
 * Must be entered with iocom->msglk held: the lock is released around
 * kdmsg_msg_receive_handling() and re-acquired afterwards.
 *
 * Sequence:
 *  - Asserts KDMSG_STATE_ABORTING is not yet set.  NOTE(review): the
 *    test that follows repeats the same condition, so it can only
 *    matter when KKASSERT is compiled out.
 *  - Sets ABORTING and marks the state and its sub-states DYING via
 *    kdmsg_state_dying().
 *  - A state still flagged KDMSG_STATE_NEW is logged; per the existing
 *    comment the abort is deferred for such states.
 *  - If rxcmd lacks DMSGF_DELETE, a message is built with
 *    kdmsg_msg_alloc(state, DMSG_LNK_ERROR, NULL, NULL): CREATE is added
 *    when rxcmd lacks it, DELETE and the REPLY bit of rxcmd are added,
 *    REVTRANS and REVCIRC are toggled, and the error is set to
 *    DMSG_ERR_LOSTLINK before the message is handed to
 *    kdmsg_msg_receive_handling().
 */
1341 kdmsg_state_abort(kdmsg_state_t *state)
1346 * Set ABORTING and DYING, return if already set. If the state was
1347 * just allocated we defer the abort operation until the related
1348 * message is processed.
1350 KKASSERT((state->flags & KDMSG_STATE_ABORTING) == 0);
1351 if (state->flags & KDMSG_STATE_ABORTING)
1353 state->flags |= KDMSG_STATE_ABORTING;
1354 kdmsg_state_dying(state);
1355 if (state->flags & KDMSG_STATE_NEW) {
1356 kdio_printf(iocom, 5,
1357 "kdmsg_state_abort(0): state %p rxcmd %08x "
1358 "txcmd %08x flags %08x - in NEW state\n",
1359 state, state->rxcmd,
1360 state->txcmd, state->flags);
1365 * NOTE: The DELETE flag might already be set due to an early
1368 * NOTE: Args to kdmsg_msg_alloc() to avoid dynamic state allocation.
1370 * NOTE: We are simulating a received message using our state
1371 * (vs a message generated by the other side using its state),
1372 * so we must invert DMSGF_REVTRANS and DMSGF_REVCIRC.
1374 kdio_printf(iocom, 5,
1375 "kdmsg_state_abort(1): state %p rxcmd %08x txcmd %08x\n",
1376 state, state->rxcmd, state->txcmd);
1377 if ((state->rxcmd & DMSGF_DELETE) == 0) {
1378 msg = kdmsg_msg_alloc(state, DMSG_LNK_ERROR, NULL, NULL);
1379 if ((state->rxcmd & DMSGF_CREATE) == 0)
1380 msg->any.head.cmd |= DMSGF_CREATE;
1381 msg->any.head.cmd |= DMSGF_DELETE |
1382 (state->rxcmd & DMSGF_REPLY);
1383 msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
1384 msg->any.head.error = DMSG_ERR_LOSTLINK;
1385 kdio_printf(iocom, 5,
1386 "kdmsg_state_abort(a): state %p msgcmd %08x\n",
1387 state, msg->any.head.cmd);
1388 /* circuit not initialized */
1389 lockmgr(&state->iocom->msglk, LK_RELEASE);
1390 kdmsg_msg_receive_handling(msg);
1391 lockmgr(&state->iocom->msglk, LK_EXCLUSIVE);
1394 kdio_printf(iocom, 5,
1395 "kdmsg_state_abort(2): state %p rxcmd %08x txcmd %08x\n",
1396 state, state->rxcmd, state->txcmd);
1400 * Recursively sets KDMSG_STATE_DYING on state and all sub-states, preventing
1401 * the transmission of any new messages on these states. This is done
1402 * atomically when parent state is terminating, whereas setting ABORTING is
1403 * not atomic and can leak races.
/*
 * kdmsg_state_dying() - set KDMSG_STATE_DYING on a state and on every
 * state reachable through its subq.
 *
 * The recursion only descends when the flag was not already set on the
 * state being visited, so an already-DYING subtree is not walked again.
 * No locking is performed here; the caller is expected to provide it
 * (presumably iocom->msglk -- confirm against callers).
 */
1407 kdmsg_state_dying(kdmsg_state_t *state)
1409 kdmsg_state_t *scan;
1411 if ((state->flags & KDMSG_STATE_DYING) == 0) {
1412 state->flags |= KDMSG_STATE_DYING;
1413 TAILQ_FOREACH(scan, &state->subq, entry)
1414 kdmsg_state_dying(scan);
1419 * Process state tracking for a message prior to transmission.
1421 * Called with msglk held and the msg dequeued. Returns non-zero if
1422 * the message is bad and should be deleted by the caller.
1424 * One-off messages are usually with dummy state and msg->state may be NULL
1425 * in this situation.
1427 * New transactions (when CREATE is set) will insert the state.
1429 * May request that caller discard the message by setting *discardp to 1.
1430 * A NULL state may be returned in this case.
/*
 * kdmsg_state_msgtx() - update transaction state for a message that is
 * about to be transmitted.
 *
 * msg - outgoing message; msg->state must be valid because the iocom is
 *       taken from msg->state->iocom.
 *
 * NOTE(review): the older header comment for this function mentions a
 * discardp out-parameter, but the signature has no such parameter; the
 * interlock test at the end consults a local error value instead.
 *
 * Outline of the statements below:
 *  - iocom->freewr_state is populated with a zeroed, M_WAITOK allocated
 *    state (KDMSG_STATE_DYNAMIC, subq initialized) when it is empty.
 *  - Messages carrying none of CREATE, DELETE or ABORT take the
 *    short-cut path.
 *  - CREATE, with or without DELETE: icmd is recorded from the base
 *    command bits, txcmd from the command with DELETE masked out, rxcmd
 *    is primed with DMSGF_REPLY and KDMSG_STATE_NEW is cleared.
 *  - DELETE: a state equal to state0, or one whose txcmd lacks CREATE,
 *    is treated as a mismatch (tolerated for ABORT, otherwise logged).
 *  - REPLY|CREATE, with or without DELETE: state0 is logged as a
 *    mismatch; otherwise txcmd is recorded with DELETE masked out.
 *  - REPLY|DELETE: same mismatch handling as DELETE.
 *  - Mid-stream ABORT is checked against state0 and a missing CREATE in
 *    txcmd in both the command and the reply direction.
 *  - When a state exists and error is 0, KDMSG_STATE_INTERLOCK is set;
 *    kdmsg_state_cleanuptx() clears it again.
 */
1434 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1436 kdmsg_iocom_t *iocom = msg->state->iocom;
1437 kdmsg_state_t *state;
1441 * Make sure a state structure is ready to go in case we need a new
1442 * one. This is the only routine which uses freewr_state so no
1443 * races are possible.
1445 if ((state = iocom->freewr_state) == NULL) {
1446 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1447 state->flags = KDMSG_STATE_DYNAMIC;
1448 state->iocom = iocom;
1450 TAILQ_INIT(&state->subq);
1451 iocom->freewr_state = state;
1455 * Lock RB tree. If persistent state is present it will have already
1456 * been assigned to msg.
1461 * Short-cut one-off or mid-stream messages (state may be NULL).
1463 if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1464 DMSGF_ABORT)) == 0) {
1470 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1471 * inside the case statements.
1473 switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1476 case DMSGF_CREATE | DMSGF_DELETE:
1478 * Insert the new persistent message state and mark
1479 * half-closed if DELETE is set. Since this is a new
1480 * message it isn't possible to transition into the fully
1481 * closed state here.
1483 * XXX state must be assigned and inserted by
1484 * kdmsg_msg_write(). txcmd is assigned by us
1487 KKASSERT(state != NULL);
1488 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1489 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1490 state->rxcmd = DMSGF_REPLY;
1491 state->flags &= ~KDMSG_STATE_NEW;
1496 * Sent ABORT+DELETE in case where msgid has already
1497 * been fully closed, ignore the message.
1499 if (state == &iocom->state0) {
1500 if (msg->any.head.cmd & DMSGF_ABORT) {
1503 kdio_printf(iocom, 1,
1504 "msgtx: no state match "
1505 "for DELETE cmd=%08x msgid=%016jx\n",
1507 (intmax_t)msg->any.head.msgid);
1514 * Sent ABORT+DELETE in case where msgid has
1515 * already been reused for an unrelated message,
1516 * ignore the message.
1518 if ((state->txcmd & DMSGF_CREATE) == 0) {
1519 if (msg->any.head.cmd & DMSGF_ABORT) {
1522 kdio_printf(iocom, 1, "%s\n",
1523 "msgtx: state reused "
1533 * Check for mid-stream ABORT command sent
1535 if (msg->any.head.cmd & DMSGF_ABORT) {
1536 if (state == &state->iocom->state0 ||
1537 (state->txcmd & DMSGF_CREATE) == 0) {
1544 case DMSGF_REPLY | DMSGF_CREATE:
1545 case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1547 * When transmitting a reply with CREATE set the original
1548 * persistent state message should already exist.
1550 if (state == &state->iocom->state0) {
1551 kdio_printf(iocom, 1, "%s\n",
1552 "msgtx: no state match "
1553 "for REPLY | CREATE");
1557 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1560 case DMSGF_REPLY | DMSGF_DELETE:
1562 * When transmitting a reply with DELETE set the original
1563 * persistent state message should already exist.
1565 * This is very similar to the REPLY|CREATE|* case except
1566 * txcmd is already stored, so we just add the DELETE flag.
1568 * Sent REPLY+ABORT+DELETE in case where msgid has
1569 * already been fully closed, ignore the message.
1571 if (state == &state->iocom->state0) {
1572 if (msg->any.head.cmd & DMSGF_ABORT) {
1575 kdio_printf(iocom, 1, "%s\n",
1576 "msgtx: no state match "
1577 "for REPLY | DELETE");
1584 * Sent REPLY+ABORT+DELETE in case where msgid has already
1585 * been reused for an unrelated message, ignore the message.
1587 if ((state->txcmd & DMSGF_CREATE) == 0) {
1588 if (msg->any.head.cmd & DMSGF_ABORT) {
1591 kdio_printf(iocom, 1, "%s\n",
1592 "msgtx: state reused "
1593 "for REPLY | DELETE");
1602 * Check for mid-stream ABORT reply sent.
1604 * One-off REPLY messages are allowed for e.g. status updates.
1606 if (msg->any.head.cmd & DMSGF_ABORT) {
1607 if (state == &state->iocom->state0 ||
1608 (state->txcmd & DMSGF_CREATE) == 0) {
1618 * Set interlock (XXX hack) in case the send side blocks and a
1619 * response is returned before kdmsg_state_cleanuptx() can be
1622 if (state && error == 0)
1623 state->flags |= KDMSG_STATE_INTERLOCK;
1629 * Called with iocom locked.
/*
 * kdmsg_state_cleanuptx() - post-transmit state bookkeeping for msg.
 *
 * msg - message that has been handed to the transmit path; it is
 *       released with kdmsg_msg_free() on every path shown below.
 *
 * Steps:
 *  - A message whose msg->state is NULL is simply freed.
 *    NOTE(review): iocom is initialized from msg->state->iocom before
 *    that NULL test, so the test cannot protect the dereference;
 *    confirm whether msg->state can ever be NULL here.
 *  - Logs when KDMSG_STATE_SIGNAL is set, then clears INTERLOCK and
 *    SIGNAL and takes a temporary hold on the state.
 *  - If the message carries DMSGF_DELETE, DELETE is recorded in txcmd
 *    after asserting it was not already there.  When rxcmd has DELETE
 *    as well, the state is removed from staterd_tree (txcmd has REPLY)
 *    or statewr_tree (otherwise), RBINSERTED is cleared, a leaf state
 *    (empty subq) is unlinked with kdmsg_subq_delete(), and the
 *    reference held for the rbtree is dropped.
 *  - If the state is ABORTING or DYING while rxcmd still lacks DELETE,
 *    ABORTING is cleared and kdmsg_state_abort() performs the deferred
 *    abort.
 *  - The temporary hold is dropped last.
 */
1633 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1635 kdmsg_iocom_t *iocom = msg->state->iocom;
1636 kdmsg_state_t *state;
1638 if ((state = msg->state) == NULL) {
1639 kdmsg_msg_free(msg);
1644 * Clear interlock (XXX hack) in case the send side blocks and a
1645 * response is returned in the other thread before
1646 * kdmsg_state_cleanuptx() can be run. We maintain our hold on
1647 * iocom->msglk so we can do this before completing our task.
1649 if (state->flags & KDMSG_STATE_SIGNAL) {
1650 kdio_printf(iocom, 1, "state %p interlock!\n", state);
1653 state->flags &= ~(KDMSG_STATE_INTERLOCK | KDMSG_STATE_SIGNAL);
1654 kdmsg_state_hold(state);
1656 if (msg->any.head.cmd & DMSGF_DELETE) {
1657 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1658 state->txcmd |= DMSGF_DELETE;
1659 if (state->rxcmd & DMSGF_DELETE) {
1660 KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1661 if (state->txcmd & DMSGF_REPLY) {
1662 KKASSERT(msg->any.head.cmd &
1664 RB_REMOVE(kdmsg_state_tree,
1665 &iocom->staterd_tree, state);
1667 KKASSERT((msg->any.head.cmd &
1669 RB_REMOVE(kdmsg_state_tree,
1670 &iocom->statewr_tree, state);
1672 state->flags &= ~KDMSG_STATE_RBINSERTED;
1675 * The subq recursion is used for parent linking and
1676 * scanning the topology for aborts, we can only
1677 * remove leafs. The circuit is effectively dead now,
1678 * but topology won't be torn down until all of its
1679 * children have finished/aborted.
1681 * This is particularly important for end-point
1682 * devices which might need to access private data
1683 * in parent states. Out of order disconnects can
1684 * occur if an end-point device is processing a
1685 * message transaction asynchronously because abort
1686 * requests are basically synchronous and it probably
1687 * isn't convenient (or possible) for the end-point
1688 * to abort an asynchronous operation.
1690 if (TAILQ_EMPTY(&state->subq))
1691 kdmsg_subq_delete(state);
1692 kdmsg_msg_free(msg);
1693 kdmsg_state_drop(state); /* state on rbtree */
1695 kdmsg_msg_free(msg);
1698 kdmsg_msg_free(msg);
1702 * Deferred abort after transmission.
1704 if ((state->flags & (KDMSG_STATE_ABORTING | KDMSG_STATE_DYING)) &&
1705 (state->rxcmd & DMSGF_DELETE) == 0) {
1706 kdio_printf(iocom, 5,
1707 "kdmsg_state_cleanuptx: state=%p "
1708 "executing deferred abort\n",
1710 state->flags &= ~KDMSG_STATE_ABORTING;
1711 kdmsg_state_abort(state);
1713 kdmsg_state_drop(state);
/*
 * _kdmsg_state_hold() - take one reference on a state.
 *
 * Increments state->refs atomically.  The kd_printf() trace reads
 * state->refs after the increment and uses file and line, which
 * presumably arrive through KDMSG_DEBUG_ARGS -- confirm against the
 * macro definition.
 */
1718 _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1720 atomic_add_int(&state->refs, 1);
1722 kd_printf(4, "state %p +%d\t%s:%d\n", state, state->refs, file, line);
/*
 * _kdmsg_state_drop() - release one reference on a state.
 *
 * Asserts the count is positive, then decrements it atomically.
 * atomic_fetchadd_int() returns the value before the addition, so a
 * result of 1 means this caller released the last reference and the
 * state is passed to kdmsg_state_free().  The state must not be touched
 * by the caller afterwards.
 */
1728 _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1730 KKASSERT(state->refs > 0);
1732 kd_printf(4, "state %p -%d\t%s:%d\n", state, state->refs, file, line);
1734 if (atomic_fetchadd_int(&state->refs, -1) == 1)
1735 kdmsg_state_free(state);
/*
 * kdmsg_state_free() - final disposal of a state whose last reference
 * was dropped (called from _kdmsg_state_drop()).
 *
 * Asserts the state is fully unlinked: not RBINSERTED, not SUBINSERTED
 * and with an empty subq.  The memory is returned with kfree() using
 * the iocom malloc type, except for the iocom embedded state0, which
 * is not freed here.
 */
1740 kdmsg_state_free(kdmsg_state_t *state)
1742 kdmsg_iocom_t *iocom = state->iocom;
1744 KKASSERT((state->flags & KDMSG_STATE_RBINSERTED) == 0);
1745 KKASSERT((state->flags & KDMSG_STATE_SUBINSERTED) == 0);
1746 KKASSERT(TAILQ_EMPTY(&state->subq));
1748 if (state != &state->iocom->state0)
1749 kfree(state, iocom->mmsg);
/*
 * kdmsg_msg_alloc() - allocate and initialize an outgoing message.
 *
 * state - existing state the message relates to; its iocom (asserted
 *         non-NULL) supplies the malloc type and msglk.  For a new
 *         transaction it presumably becomes the parent (pstate) of the
 *         state created here -- TODO confirm.
 * cmd   - command word; (cmd & DMSGF_SIZE) * DMSG_ALIGN is the header
 *         size in bytes and is stored in msg->hdr_size.
 * func  - callback for a newly created state (presumably stored next
 *         to data -- confirm).
 * data  - stored in state->any.any of a newly created state.
 *
 * The message is zeroed and allocated with M_WAITOK.
 *
 * When cmd has CREATE without REPLY a new zeroed state is allocated,
 * its msgid is derived from its own address, and under iocom->msglk it
 * is inserted into statewr_tree (panic on a duplicate msgid) and
 * appended to the parent subq.  The parent gains a hold if its subq
 * was empty; the new state gets one hold each for the subq, the rbtree
 * and msg->state, and inherits KDMSG_STATE_DYING from the parent.
 * Otherwise the existing state, whose parent must be non-NULL, is held
 * once for msg->state.
 *
 * DMSGF_REVTRANS / DMSGF_REVCIRC are added to cmd when the state / its
 * parent is flagged KDMSG_STATE_OPPOSITE.  The header magic, cmd, msgid
 * and circuit (the parent msgid) are then filled in.
 */
1753 kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
1754 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1756 kdmsg_iocom_t *iocom = state->iocom;
1757 kdmsg_state_t *pstate;
1761 KKASSERT(iocom != NULL);
1762 hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1763 msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1764 iocom->mmsg, M_WAITOK | M_ZERO);
1765 msg->hdr_size = hbytes;
1767 if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
1769 * New transaction, requires tracking state and a unique
1770 * msgid to be allocated.
1772 * It is possible to race a circuit failure, inherit the
1773 * parent's STATE_DYING flag to trigger an abort sequence
1774 * in the transmit path. By not inheriting ABORTING the
1775 * abort sequence can recurse.
1777 * NOTE: The transactions has not yet been initiated so we
1778 * cannot set DMSGF_CREATE/DELETE bits in txcmd or rxcmd.
1779 * We have to properly setup DMSGF_REPLY, however.
1782 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1783 TAILQ_INIT(&state->subq);
1784 state->iocom = iocom;
1785 state->parent = pstate;
1786 state->flags = KDMSG_STATE_DYNAMIC |
1789 state->any.any = data;
1790 state->msgid = (uint64_t)(uintptr_t)state;
1791 /*msg->any.head.msgid = state->msgid;XXX*/
1793 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1794 if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1795 panic("duplicate msgid allocated");
1796 if (TAILQ_EMPTY(&pstate->subq))
1797 kdmsg_state_hold(pstate);/* pstate->subq */
1798 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
1799 state->flags |= KDMSG_STATE_RBINSERTED |
1800 KDMSG_STATE_SUBINSERTED;
1801 state->flags |= pstate->flags & KDMSG_STATE_DYING;
1802 kdmsg_state_hold(state); /* pstate->subq */
1803 kdmsg_state_hold(state); /* state on rbtree */
1804 kdmsg_state_hold(state); /* msg->state */
1805 lockmgr(&iocom->msglk, LK_RELEASE);
1807 pstate = state->parent;
1808 KKASSERT(pstate != NULL);
1809 kdmsg_state_hold(state); /* msg->state */
1812 if (state->flags & KDMSG_STATE_OPPOSITE)
1813 cmd |= DMSGF_REVTRANS;
1814 if (pstate->flags & KDMSG_STATE_OPPOSITE)
1815 cmd |= DMSGF_REVCIRC;
1817 msg->any.head.magic = DMSG_HDR_MAGIC;
1818 msg->any.head.cmd = cmd;
1819 msg->any.head.msgid = state->msgid;
1820 msg->any.head.circuit = pstate->msgid;
/*
 * kdmsg_msg_free() - release a message and the state reference it holds.
 *
 * - Auxiliary data is freed only when KDMSG_FLAG_AUXALLOC is set and
 *   both aux_data and aux_size are non-zero; the flag is then cleared.
 *   kdmsg_detach_aux_data() clears the same flag to take over the
 *   buffer, in which case nothing is freed here.
 * - The reference tagged msg->state is dropped.
 * - The message itself is returned with kfree() using the iocom malloc
 *   type, which was captured before the state reference was dropped.
 *
 * NOTE(review): iocom is read through msg->state->iocom before the
 * msg->state != NULL test, so a NULL msg->state would already have
 * faulted; confirm whether that test is still needed.
 */
1827 kdmsg_msg_free(kdmsg_msg_t *msg)
1829 kdmsg_iocom_t *iocom = msg->state->iocom;
1830 kdmsg_state_t *state;
1832 if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1833 msg->aux_data && msg->aux_size) {
1834 kfree(msg->aux_data, iocom->mmsg);
1835 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1837 if ((state = msg->state) != NULL) {
1839 kdmsg_state_drop(state); /* msg->state */
1841 msg->aux_data = NULL;
1844 kfree(msg, iocom->mmsg);
/*
 * kdmsg_detach_aux_data() - hand the auxiliary buffer of msg to data.
 *
 * msg  - message whose aux buffer is being detached.
 * data - receives aux_data, aux_size and the owning iocom.
 *
 * When KDMSG_FLAG_AUXALLOC is set the buffer pointer, size and iocom
 * are copied into data and the flag is cleared, so kdmsg_msg_free() no
 * longer frees the buffer; it is later released with
 * kdmsg_free_aux_data().  Otherwise data->aux_data is set to NULL and
 * only the iocom is recorded.
 */
1848 kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data)
1850 if (msg->flags & KDMSG_FLAG_AUXALLOC) {
1851 data->aux_data = msg->aux_data;
1852 data->aux_size = msg->aux_size;
1853 data->iocom = msg->state->iocom;
1854 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1856 data->aux_data = NULL;
1858 data->iocom = msg->state->iocom;
/*
 * kdmsg_free_aux_data() - release a buffer previously detached with
 * kdmsg_detach_aux_data().
 *
 * data->aux_data is freed with kfree() against the malloc type of
 * data->iocom, the iocom recorded at detach time.
 */
1863 kdmsg_free_aux_data(kdmsg_data_t *data)
1866 kfree(data->aux_data, data->iocom->mmsg);
1870 * Indexed messages are stored in a red-black tree indexed by their
1871 * msgid. Only persistent messages are indexed.
/*
 * kdmsg_state_cmp() - ordering function for the kdmsg_state_tree
 * red-black tree.
 *
 * States are ordered by their iocom pointer first and by msgid second.
 * The result presumably follows the usual negative / positive / zero
 * comparator convention -- confirm against the return statements.
 */
1874 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1876 if (state1->iocom < state2->iocom)
1878 if (state1->iocom > state2->iocom)
1880 if (state1->msgid < state2->msgid)
1882 if (state1->msgid > state2->msgid)
1888 * Write a message. All requisite command flags have been set.
1890 * If msg->state is non-NULL the message is written to the existing
1891 * transaction. msgid will be set accordingly.
1893 * If msg->state is NULL and CREATE is set new state is allocated and
1894 * (func, data) is installed. A msgid is assigned.
1896 * If msg->state is NULL and CREATE is not set the message is assumed
1897 * to be a one-way message. The originator must assign the msgid
1898 * (or leave it 0, which is typical).
1900 * This function merely queues the message to the management thread, it
1901 * does not write to the message socket/pipe.
/*
 * kdmsg_msg_write() - locking wrapper around kdmsg_msg_write_locked().
 *
 * msg - message to queue; msg->state must be valid since the iocom is
 *       taken from msg->state->iocom.
 *
 * Acquires iocom->msglk exclusively, queues the message and releases
 * the lock.  Callers must therefore not already hold msglk.
 */
1904 kdmsg_msg_write(kdmsg_msg_t *msg)
1906 kdmsg_iocom_t *iocom = msg->state->iocom;
1908 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1909 kdmsg_msg_write_locked(iocom, msg);
1910 lockmgr(&iocom->msglk, LK_RELEASE);
/*
 * kdmsg_msg_write_locked() - queue msg on iocom->msgq for the
 * management thread.  kdmsg_msg_write() calls this with iocom->msglk
 * held exclusively.
 *
 * iocom - communications endpoint owning the queue.
 * msg   - message to queue.
 *
 * Outline:
 *  - msg->any.head.msgid is taken from state->msgid for a message that
 *    belongs to a transaction and forced to 0 for a one-off message.
 *  - NOTE(review): the KDMSG_IOCOMF_EXITNOACC test releases msglk and
 *    panics, while the adjacent comment says the panic was removed;
 *    confirm whether this code is still compiled in.
 *  - Asserts that txcmd does not already carry DMSGF_DELETE.
 *  - A write to a dying circuit (state DYING, parent txcmd has DELETE,
 *    or parent DYING) is logged and the message is pushed through
 *    kdmsg_state_msgtx() and kdmsg_state_cleanuptx() directly while the
 *    state is held; presumably it is not queued in that case -- TODO
 *    confirm the return path.
 *  - Otherwise the header is finished: salt from the low 8 bits of
 *    iocom->msg_seq; aux_bytes and aux_crc when aux data is present
 *    (the CRC covers DMSG_DOALIGN(aux_size) bytes); hdr_crc computed
 *    over hdr_size bytes with the field zeroed first.
 *  - The message is appended to iocom->msgq and, if the
 *    KDMSG_CLUSTERCTL_SLEEPING bit is set in msg_ctl, the bit is
 *    cleared and wakeup() is issued on &iocom->msg_ctl.
 */
1914 kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg)
1916 kdmsg_state_t *state;
1920 * Continuance or termination of existing transaction.
1921 * The transaction could have been initiated by either end.
1923 * (Function callback and aux data for the receive side can
1924 * be replaced or left alone).
1927 msg->any.head.msgid = state->msgid;
1930 * One-off message (always uses msgid 0 to distinguish
1931 * between a possibly lost in-transaction message due to
1932 * competing aborts and a real one-off message?)
1935 msg->any.head.msgid = 0;
1940 * XXX removed - don't make this a panic, allow the state checks
1941 * below to catch the situation.
1943 * This flag is not set until after the tx thread has drained
1944 * the tx msgq and simulated responses. After that point the
1945 * txthread is dead and can no longer simulate responses.
1947 * Device drivers should never try to send a message once this
1948 * flag is set. They should have detected (through the state
1949 * closures) that the link is in trouble.
1951 if (iocom->flags & KDMSG_IOCOMF_EXITNOACC) {
1952 lockmgr(&iocom->msglk, LK_RELEASE);
1953 panic("kdmsg_msg_write: Attempt to write message to "
1954 "terminated iocom\n");
1959 * For stateful messages, if the circuit is dead or dying we have
1960 * to abort the potentially newly-created state and discard the
1963 * - We must discard the message because the other end will not
1964 * be expecting any more messages over the dead or dying circuit
1965 * and might not be able to receive them.
1967 * - We abort the state by simulating a failure to generate a fake
1968 * incoming DELETE. This will trigger the state callback and allow
1969 * the device to clean things up and reply, closing the outgoing
1970 * direction and allowing the state to be freed.
1972 * This situation occurs quite often, particularly as SPANs stabilize.
1973 * End-points must do the right thing.
1976 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1977 if (state->flags & KDMSG_STATE_DYING) {
1979 if ((state->flags & KDMSG_STATE_DYING) ||
1980 (state->parent->txcmd & DMSGF_DELETE) ||
1981 (state->parent->flags & KDMSG_STATE_DYING)) {
1983 kdio_printf(iocom, 4,
1984 "kdmsg_msg_write: Write to dying circuit "
1986 "ptxcmd=%08x prxcmd=%08x flags=%08x\n",
1988 state->parent->rxcmd,
1989 state->parent->txcmd,
1990 state->parent->flags);
1991 kdmsg_state_hold(state);
1992 kdmsg_state_msgtx(msg);
1993 kdmsg_state_cleanuptx(msg);
1994 kdmsg_state_drop(state);
2000 * Finish up the msg fields. Note that msg->aux_size and the
2001 * aux_bytes stored in the message header represent the unaligned
2002 * (actual) bytes of data, but the buffer is sized to an aligned
2003 * size and the CRC is generated over the aligned length.
2005 msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
2008 if (msg->aux_data && msg->aux_size) {
2009 uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
2011 msg->any.head.aux_bytes = msg->aux_size;
2012 msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
2014 msg->any.head.hdr_crc = 0;
2015 msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
2017 TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
2019 if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
2020 atomic_clear_int(&iocom->msg_ctl,
2021 KDMSG_CLUSTERCTL_SLEEPING);
2022 wakeup(&iocom->msg_ctl);
2027 * Reply to a message and terminate our side of the transaction.
2029 * If msg->state is the iocom's state0 we are replying to a one-way message.
/*
 * kdmsg_msg_reply() - answer msg with a DMSG_LNK_ERROR carrying error
 * and close our side of the transaction.
 *
 * msg   - message being answered; only msg->state and the REPLY bit of
 *         its command are consulted here.
 * error - value stored in the head.error field of the reply.
 *
 * For a real transaction (state is not the iocom state0):
 *  - the txcmd DELETE bit is tested first; per the existing comment an
 *    already closed direction just returns;
 *  - DMSGF_CREATE is added when our txcmd has not been initiated;
 *  - the txcmd REPLY bit is tested to pick the direction;
 *  - DMSGF_DELETE is always added, terminating our side.
 * For state0 the REPLY bit of the incoming command is tested instead.
 *
 * The reply is allocated with kdmsg_msg_alloc() (no callback, no data)
 * and queued through kdmsg_msg_write(), which takes iocom->msglk.
 */
2032 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
2034 kdmsg_state_t *state = msg->state;
2039 * Reply with a simple error code and terminate the transaction.
2041 cmd = DMSG_LNK_ERROR;
2044 * Check if our direction has even been initiated yet, set CREATE.
2046 * Check what direction this is (command or reply direction). Note
2047 * that txcmd might not have been initiated yet.
2049 * If our direction has already been closed we just return without
2052 if (state != &state->iocom->state0) {
2053 if (state->txcmd & DMSGF_DELETE)
2055 if ((state->txcmd & DMSGF_CREATE) == 0)
2056 cmd |= DMSGF_CREATE;
2057 if (state->txcmd & DMSGF_REPLY)
2059 cmd |= DMSGF_DELETE;
2061 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2065 nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2066 nmsg->any.head.error = error;
2067 kdmsg_msg_write(nmsg);
2071 * Reply to a message and continue our side of the transaction.
2073 * If msg->state is the iocom's state0 we are replying to a one-way message and
2074 * this function degenerates into the same as kdmsg_msg_reply().
/*
 * kdmsg_msg_result() - answer msg with a DMSG_LNK_ERROR carrying error
 * while leaving our side of the transaction open.
 *
 * msg   - message being answered; only msg->state and the REPLY bit of
 *         its command are consulted here.
 * error - value stored in the head.error field of the result message.
 *
 * Mirrors kdmsg_msg_reply() except that DMSGF_DELETE is never added:
 *  - for a real transaction (state is not the iocom state0) the txcmd
 *    DELETE bit is tested first, DMSGF_CREATE is added when our txcmd
 *    has not been initiated, and the txcmd REPLY bit is tested to pick
 *    the direction;
 *  - for state0 the REPLY bit of the incoming command is tested.
 *
 * The message is allocated with kdmsg_msg_alloc() (no callback, no
 * data) and queued through kdmsg_msg_write(), which takes iocom->msglk.
 */
2077 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
2079 kdmsg_state_t *state = msg->state;
2084 * Return a simple result code, do NOT terminate the transaction.
2086 cmd = DMSG_LNK_ERROR;
2089 * Check if our direction has even been initiated yet, set CREATE.
2091 * Check what direction this is (command or reply direction). Note
2092 * that txcmd might not have been initiated yet.
2094 * If our direction has already been closed we just return without
2097 if (state != &state->iocom->state0) {
2098 if (state->txcmd & DMSGF_DELETE)
2100 if ((state->txcmd & DMSGF_CREATE) == 0)
2101 cmd |= DMSGF_CREATE;
2102 if (state->txcmd & DMSGF_REPLY)
2104 /* continuing transaction, do not set MSGF_DELETE */
2106 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2110 nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2111 nmsg->any.head.error = error;
2112 kdmsg_msg_write(nmsg);
2116 * Reply to a message and terminate our side of the transaction.
2118 * If msg->state is non-NULL we are replying to a one-way message.
/*
 * kdmsg_state_reply() - send a DMSG_LNK_ERROR carrying error on state
 * and close our side of the transaction.
 *
 * state - transaction state to reply on; unlike kdmsg_msg_reply() no
 *         incoming message is involved.
 * error - value stored in the head.error field of the reply.
 *
 * The txcmd DELETE bit is tested first; per the existing comment an
 * already closed direction just returns.  DMSGF_CREATE is added when
 * our txcmd has not been initiated, the txcmd REPLY bit is tested to
 * pick the direction, and DMSGF_DELETE is always added.
 *
 * The reply is allocated with kdmsg_msg_alloc() (no callback, no data)
 * and queued through kdmsg_msg_write(), which takes iocom->msglk.
 */
2121 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
2127 * Reply with a simple error code and terminate the transaction.
2129 cmd = DMSG_LNK_ERROR;
2132 * Check if our direction has even been initiated yet, set CREATE.
2134 * Check what direction this is (command or reply direction). Note
2135 * that txcmd might not have been initiated yet.
2137 * If our direction has already been closed we just return without
2141 if (state->txcmd & DMSGF_DELETE)
2143 if ((state->txcmd & DMSGF_CREATE) == 0)
2144 cmd |= DMSGF_CREATE;
2145 if (state->txcmd & DMSGF_REPLY)
2147 cmd |= DMSGF_DELETE;
2149 nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2150 nmsg->any.head.error = error;
2151 kdmsg_msg_write(nmsg);
2155 * Reply to a message and continue our side of the transaction.
2157 * If msg->state is non-NULL we are replying to a one-way message and this
2158 * function degenerates into the same as kdmsg_msg_reply().
/*
 * kdmsg_state_result() - send a DMSG_LNK_ERROR carrying error on state
 * while leaving our side of the transaction open.
 *
 * state - transaction state to send on.
 * error - value stored in the head.error field of the result message.
 *
 * Mirrors kdmsg_state_reply() except that DMSGF_DELETE is never added:
 * the txcmd DELETE bit is tested first, DMSGF_CREATE is added when our
 * txcmd has not been initiated, and the txcmd REPLY bit is tested to
 * pick the direction.
 *
 * The message is allocated with kdmsg_msg_alloc() (no callback, no
 * data) and queued through kdmsg_msg_write(), which takes iocom->msglk.
 */
2161 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
2167 * Return a simple result code, do NOT terminate the transaction.
2169 cmd = DMSG_LNK_ERROR;
2172 * Check if our direction has even been initiated yet, set CREATE.
2174 * Check what direction this is (command or reply direction). Note
2175 * that txcmd might not have been initiated yet.
2177 * If our direction has already been closed we just return without
2181 if (state->txcmd & DMSGF_DELETE)
2183 if ((state->txcmd & DMSGF_CREATE) == 0)
2184 cmd |= DMSGF_CREATE;
2185 if (state->txcmd & DMSGF_REPLY)
2187 /* continuing transaction, do not set MSGF_DELETE */
2189 nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2190 nmsg->any.head.error = error;
2191 kdmsg_msg_write(nmsg);