cluster - xdisk automatic BIO restart
author Matthew Dillon <dillon@apollo.backplane.com>
Wed, 5 Dec 2012 03:24:54 +0000 (19:24 -0800)
committer Matthew Dillon <dillon@apollo.backplane.com>
Wed, 5 Dec 2012 03:24:54 +0000 (19:24 -0800)
* The xdisk driver now detects failed transactions due to failed circuits
  and will restart the BIOs on another circuit or hold onto them until
  connectivity is restored and a new circuit is reforged.

  Automatic restarts only occur if the xa* disk is open()'d (i.e. mounted
  or being accessed by userland).  Kernel disk subsystem probes on attach
  still fail normally instead of stalling on lost connectivity.

* subr_diskiocom now reports the correct DMSG error code for failed BIOs
  instead of reporting a kernel error code.

sys/dev/disk/xdisk/xdisk.c
sys/kern/subr_diskiocom.c
sys/sys/dmsg.h

index ef891f5..8cae000 100644 (file)
@@ -500,8 +500,6 @@ xa_terminate_check(struct xa_softc *xa)
                        tag->bio = NULL;
                        bio->bio_buf->b_error = ENXIO;
                        bio->bio_buf->b_flags |= B_ERROR;
-                       if (bio->bio_buf->b_cmd == BUF_CMD_FLUSH)
-                               kprintf("xa_terminate: bio flush terminated tag %p\n", tag);
                        biodone(bio);
                }
                TAILQ_INSERT_TAIL(&xa->tag_freeq, tag, entry);
@@ -581,7 +579,7 @@ xa_autodmsg(kdmsg_msg_t *msg)
                break;
        case DMSG_LNK_CIRC | DMSGF_DELETE | DMSGF_REPLY:
                /*
-                * Losing virtual circuit.  Scan pending tags.
+                * Losing virtual circuit.  Remove the circ from contention.
                 */
                circ = msg->state->any.circ;
                lwkt_gettoken(&xa->tok);
@@ -589,6 +587,7 @@ xa_autodmsg(kdmsg_msg_t *msg)
                        TAILQ_REMOVE(&xa->circq, circ, entry);
                        circ->recorded = 0;
                }
+               xa_restart_deferred(xa);
                lwkt_reltoken(&xa->tok);
                break;
        default:
@@ -764,11 +763,7 @@ xa_strategy(struct dev_strategy_args *ap)
         * only if the device is not open.  That is, we allow the disk
         * probe code prior to mount to fail.
         */
-       if (bio->bio_buf->b_cmd == BUF_CMD_FLUSH)
-               kprintf("xa: flush strategy\n");
        if (xa->attached == 0 && xa->opencnt == 0) {
-               if (bio->bio_buf->b_cmd == BUF_CMD_FLUSH)
-                       kprintf("xa: flush error\n");
                bio->bio_buf->b_error = ENXIO;
                bio->bio_buf->b_flags |= B_ERROR;
                biodone(bio);
@@ -776,14 +771,8 @@ xa_strategy(struct dev_strategy_args *ap)
        }
 
        tag = xa_setup_cmd(xa, bio);
-       if (tag) {
-               if (bio->bio_buf->b_cmd == BUF_CMD_FLUSH)
-                       kprintf("xa: flush start xa %p tag %p\n", xa, tag);
+       if (tag)
                xa_start(tag, NULL);
-       } else {
-               if (bio->bio_buf->b_cmd == BUF_CMD_FLUSH)
-                       kprintf("xa: flush defer\n");
-       }
        return(0);
 }
 
@@ -820,7 +809,11 @@ xa_setup_cmd(xa_softc_t *xa, struct bio *bio)
         * Only get a tag if we have a valid virtual circuit to the server.
         */
        lwkt_gettoken(&xa->tok);
-       if ((circ = TAILQ_FIRST(&xa->circq)) == NULL || xa->attached <= 0) {
+       TAILQ_FOREACH(circ, &xa->circq, entry) {
+               if (circ->lost == 0)
+                       break;
+       }
+       if (circ == NULL || xa->attached <= 0) {
                tag = NULL;
        } else if ((tag = TAILQ_FIRST(&xa->tag_freeq)) != NULL) {
                TAILQ_REMOVE(&xa->tag_freeq, tag, entry);
@@ -990,7 +983,7 @@ static int
 xa_bio_completion(kdmsg_state_t *state, kdmsg_msg_t *msg)
 {
        xa_tag_t *tag = state->any.any;
-       /*xa_softc_t *xa = tag->xa;*/
+       xa_softc_t *xa = tag->xa;
        struct bio *bio;
        struct buf *bp;
 
@@ -1020,6 +1013,19 @@ xa_bio_completion(kdmsg_state_t *state, kdmsg_msg_t *msg)
        }
 
        /*
+        * Potentially move the bio back onto the pending queue if the
+        * device is open and the error is related to losing the virtual
+        * circuit.
+        */
+       if (tag->status.head.error &&
+           (msg->any.head.cmd & DMSGF_DELETE) && xa->opencnt) {
+               if (tag->status.head.error == DMSG_ERR_LOSTLINK ||
+                   tag->status.head.error == DMSG_ERR_CANTCIRC) {
+                       goto handle_repend;
+               }
+       }
+
+       /*
         * Process bio completion
         *
         * For reads any returned data is zero-extended if necessary, so
@@ -1051,9 +1057,6 @@ xa_bio_completion(kdmsg_state_t *state, kdmsg_msg_t *msg)
                } else {
                        bp->b_resid = 0;
                }
-               if (bio->bio_buf->b_cmd == BUF_CMD_FLUSH)
-                       kprintf("xa_bio_completion of flush tag %p bio %p\n",
-                               tag, bio);
                biodone(bio);
                tag->bio = NULL;
                break;
@@ -1071,11 +1074,38 @@ xa_bio_completion(kdmsg_state_t *state, kdmsg_msg_t *msg)
 handle_done:
        if (msg->any.head.cmd & DMSGF_DELETE) {
                xa_done(tag, 1);
-               if ((state->txcmd & DMSGF_DELETE) == 0) {
+               if ((state->txcmd & DMSGF_DELETE) == 0)
                        kdmsg_msg_reply(msg, 0);
-               }
        }
        return (0);
+
+       /*
+        * Handle the case where the transaction failed due to a
+        * connectivity issue.  The tag is put away with wasbio=0
+        * and we restart the bio.
+        *
+        * Setting circ->lost causes xa_setup_cmd() to skip the circuit.
+        * Other circuits might still be live.  Once a circuit gets messed
+        * up it will (eventually) be deleted so we can simply leave (lost)
+        * set forever after.
+        */
+handle_repend:
+       lwkt_gettoken(&xa->tok);
+       kprintf("BIO CIRC FAILURE, REPEND BIO %p\n", bio);
+       tag->circ->lost = 1;
+       tag->bio = NULL;
+       xa_done(tag, 0);
+       if ((state->txcmd & DMSGF_DELETE) == 0)
+               kdmsg_msg_reply(msg, 0);
+
+       /*
+        * Restart or requeue the bio
+        */
+       tag = xa_setup_cmd(xa, bio);
+       if (tag)
+               xa_start(tag, NULL);
+       lwkt_reltoken(&xa->tok);
+       return (0);
 }
 
 /*
index 345713a..1101392 100644 (file)
@@ -582,10 +582,17 @@ diskiodone(struct bio *bio)
        default:
                panic("diskiodone: Unknown bio cmd = %d\n",
                      bio->bio_buf->b_cmd);
-               break; /* NOT REACHED */
+               error = 0;      /* avoid compiler warning */
+               break;          /* NOT REACHED */
        }
 
        /*
+        * Convert error to DMSG_ERR_* code.
+        */
+       if (error)
+               error = DMSG_ERR_IO;
+
+       /*
         * Convert LNK_ERROR or BLK_ERROR if non-zero resid.  READS will
         * have already converted cmd to BLK_ERROR and set up data to return.
         */
index 9f4501b..2e9562a 100644 (file)
@@ -834,6 +834,7 @@ struct kdmsg_circuit {
        uint64_t                msgid;
        int                     weight;
        int                     recorded;       /* written by shim */
+       int                     lost;           /* written by shim */
        int                     refs;           /* written by shim */
 };