cluster - Major kernel component work (diskiocom, xdisk, kdmsg)
[dragonfly.git] / sys / dev / disk / xdisk / xdisk.c
CommitLineData
ddfbb283
MD
1/*
2 * Copyright (c) 2012 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34/*
35 * This module allows disk devices to be created and associated with a
36 * communications pipe or socket. You open the device and issue an
37 * ioctl() to install a new disk along with its communications descriptor.
38 *
39 * All further communication occurs via the descriptor using the DMSG
40 * LNK_CONN, LNK_SPAN, and BLOCK protocols. The descriptor can be a
41 * direct connection to a remote machine's disk (in-kernel), to a remote
42 * cluster controller, to the local cluster controller, etc.
43 *
44 * /dev/xdisk is the control device, issue ioctl()s to create the /dev/xa%d
45 * devices. These devices look like raw disks to the system.
0c98b966
MD
46 *
47 * TODO:
48 * Handle circuit disconnects, leave bio's pending
49 * Restart bio's on circuit reconnect.
ddfbb283
MD
50 */
51#include <sys/param.h>
52#include <sys/systm.h>
53#include <sys/buf.h>
54#include <sys/conf.h>
55#include <sys/device.h>
56#include <sys/devicestat.h>
57#include <sys/disk.h>
58#include <sys/kernel.h>
59#include <sys/malloc.h>
60#include <sys/sysctl.h>
61#include <sys/proc.h>
62#include <sys/queue.h>
63#include <sys/udev.h>
64#include <sys/uuid.h>
65#include <sys/kern_syscall.h>
66
67#include <sys/dmsg.h>
68#include <sys/xdiskioctl.h>
69
70#include <sys/buf2.h>
71#include <sys/thread2.h>
72
0f50fb46
MD
struct xa_softc;

/*
 * Per-transaction tracking structure.  A fixed pool of MAXTAGS tags is
 * allocated for each xa_softc; each tag represents one DMSG BLOCK
 * transaction (open, read, write, flush, or freeblks).
 */
struct xa_tag {
	TAILQ_ENTRY(xa_tag) entry;
	struct xa_softc	*xa;		/* owning device softc */
	dmsg_blk_error_t status;	/* last reply status received */
	kdmsg_state_t	*state;		/* transaction state, NULL when done */
	kdmsg_circuit_t	*circ;		/* virtual circuit (held while set) */
	struct bio	*bio;		/* associated bio, NULL if none */
	int		running;	/* transaction running */
	int		waitseq;	/* streaming reply */
	int		done;		/* final (transaction closed) */
};

typedef struct xa_tag	xa_tag_t;

/*
 * Per-device (xa%d) softc.  Lifetime is governed by the opencnt,
 * attached, and serializing fields; see xa_terminate_check().
 */
struct xa_softc {
	TAILQ_ENTRY(xa_softc) entry;
	cdev_t		dev;		/* disk device, NULL once destroyed */
	kdmsg_iocom_t	iocom;		/* DMSG communications state */
	struct xdisk_attach_ioctl xaioc; /* copy of attach parameters */
	struct disk_info info;
	struct disk	disk;
	uuid_t		pfs_fsid;	/* unique rendezvous id */
	int		unit;		/* xa%d unit number */
	int		serializing;	/* attach/detach/teardown interlock */
	int		attached;	/* -1 forces deferral, see xa_exit() */
	int		opencnt;
	uint64_t	keyid;		/* returned by BLK_OPEN reply */
	xa_tag_t	*opentag;	/* long-lived BLK_OPEN transaction */
	TAILQ_HEAD(, bio) bioq;		/* bios deferred for lack of tag/circuit */
	TAILQ_HEAD(, xa_tag) tag_freeq;
	TAILQ_HEAD(, xa_tag) tag_pendq;
	TAILQ_HEAD(, kdmsg_circuit) circq; /* sorted by ascending weight */
	struct lwkt_token tok;
};

typedef struct xa_softc xa_softc_t;

#define MAXTAGS		64	/* no real limit */

static int xdisk_attach(struct xdisk_attach_ioctl *xaioc);
static int xdisk_detach(struct xdisk_attach_ioctl *xaioc);
static void xa_exit(kdmsg_iocom_t *iocom);
static void xa_terminate_check(struct xa_softc *xa);
static int xa_rcvdmsg(kdmsg_msg_t *msg);
static void xa_autodmsg(kdmsg_msg_t *msg);

static xa_tag_t *xa_setup_cmd(xa_softc_t *xa, struct bio *bio);
static void xa_start(xa_tag_t *tag, kdmsg_msg_t *msg);
static uint32_t xa_wait(xa_tag_t *tag, int seq);
static void xa_done(xa_tag_t *tag, int wasbio);
static int xa_sync_completion(kdmsg_state_t *state, kdmsg_msg_t *msg);
static int xa_bio_completion(kdmsg_state_t *state, kdmsg_msg_t *msg);
static void xa_restart_deferred(xa_softc_t *xa);

MALLOC_DEFINE(M_XDISK, "Networked disk client", "Network Disks");
130
/*
 * Control device, issue ioctls to create xa devices.
 */
static d_open_t xdisk_open;
static d_close_t xdisk_close;
static d_ioctl_t xdisk_ioctl;

static struct dev_ops xdisk_ops = {
	{ "xdisk", 0, D_MPSAFE | D_TRACKCLOSE },
	.d_open =	xdisk_open,
	.d_close =	xdisk_close,
	.d_ioctl =	xdisk_ioctl
};

/*
 * XA disk devices
 */
static d_open_t xa_open;
static d_close_t xa_close;
static d_ioctl_t xa_ioctl;
static d_strategy_t xa_strategy;
static d_psize_t xa_size;

static struct dev_ops xa_ops = {
	{ "xa", 0, D_DISK | D_CANFREE | D_MPSAFE | D_TRACKCLOSE },
	.d_open =	xa_open,
	.d_close =	xa_close,
	.d_ioctl =	xa_ioctl,
	.d_read =	physread,
	.d_write =	physwrite,
	.d_strategy =	xa_strategy,
	.d_psize =	xa_size
};

/*
 * xdisk_token interlocks xa_queue and the lifecycle flags (serializing,
 * attached, opencnt) of every xa_softc.
 */
static struct lwkt_token xdisk_token = LWKT_TOKEN_INITIALIZER(xdisk_token);
static int xdisk_opencount;		/* opens of the control device */
static cdev_t xdisk_dev;		/* /dev/xdisk control device */
static TAILQ_HEAD(, xa_softc) xa_queue;	/* all attached xa softcs */
169
170/*
171 * Module initialization
172 */
173static int
174xdisk_modevent(module_t mod, int type, void *data)
175{
176 switch (type) {
177 case MOD_LOAD:
178 TAILQ_INIT(&xa_queue);
179 xdisk_dev = make_dev(&xdisk_ops, 0,
180 UID_ROOT, GID_WHEEL, 0600, "xdisk");
181 break;
182 case MOD_UNLOAD:
183 case MOD_SHUTDOWN:
184 if (xdisk_opencount || TAILQ_FIRST(&xa_queue))
185 return (EBUSY);
186 if (xdisk_dev) {
187 destroy_dev(xdisk_dev);
188 xdisk_dev = NULL;
189 }
190 dev_ops_remove_all(&xdisk_ops);
191 dev_ops_remove_all(&xa_ops);
192 break;
193 default:
194 break;
195 }
196 return 0;
197}
198
199DEV_MODULE(xdisk, xdisk_modevent, 0);
200
201/*
202 * Control device
203 */
204static int
205xdisk_open(struct dev_open_args *ap)
206{
207 lwkt_gettoken(&xdisk_token);
208 ++xdisk_opencount;
209 lwkt_reltoken(&xdisk_token);
210 return(0);
211}
212
213static int
214xdisk_close(struct dev_close_args *ap)
215{
216 lwkt_gettoken(&xdisk_token);
217 --xdisk_opencount;
218 lwkt_reltoken(&xdisk_token);
219 return(0);
220}
221
222static int
223xdisk_ioctl(struct dev_ioctl_args *ap)
224{
225 int error;
226
227 switch(ap->a_cmd) {
228 case XDISKIOCATTACH:
229 error = xdisk_attach((void *)ap->a_data);
230 break;
0f50fb46
MD
231 case XDISKIOCDETACH:
232 error = xdisk_detach((void *)ap->a_data);
233 break;
ddfbb283
MD
234 default:
235 error = ENOTTY;
236 break;
237 }
238 return error;
239}
240
241/************************************************************************
242 * DMSG INTERFACE *
243 ************************************************************************/
244
245static int
246xdisk_attach(struct xdisk_attach_ioctl *xaioc)
247{
0f50fb46
MD
248 xa_softc_t *xa;
249 xa_tag_t *tag;
ddfbb283 250 struct file *fp;
ddfbb283 251 int unit;
0f50fb46 252 int n;
ddfbb283
MD
253 char devname[64];
254 cdev_t dev;
255
0f50fb46
MD
256 /*
257 * Normalize ioctl params
258 */
ddfbb283
MD
259 fp = holdfp(curproc->p_fd, xaioc->fd, -1);
260 if (fp == NULL)
261 return EINVAL;
0f50fb46
MD
262 if (xaioc->cl_label[sizeof(xaioc->cl_label) - 1] != 0)
263 return EINVAL;
264 if (xaioc->fs_label[sizeof(xaioc->fs_label) - 1] != 0)
265 return EINVAL;
266 if (xaioc->blksize < DEV_BSIZE || xaioc->blksize > MAXBSIZE)
267 return EINVAL;
ddfbb283
MD
268
269 /*
0f50fb46
MD
270 * See if the serial number is already present. If we are
271 * racing a termination the disk subsystem may still have
272 * duplicate entries not yet removed so we wait a bit and
273 * retry.
ddfbb283
MD
274 */
275 lwkt_gettoken(&xdisk_token);
0f50fb46
MD
276again:
277 TAILQ_FOREACH(xa, &xa_queue, entry) {
278 if (strcmp(xa->iocom.auto_lnk_conn.fs_label,
279 xaioc->fs_label) == 0) {
280 if (xa->serializing) {
281 tsleep(xa, 0, "xadelay", hz / 10);
282 goto again;
283 }
284 xa->serializing = 1;
285 kdmsg_iocom_uninit(&xa->iocom);
286 break;
287 }
288 }
289
290 /*
291 * Create a new xa if not already present
292 */
293 if (xa == NULL) {
294 unit = 0;
295 for (;;) {
296 TAILQ_FOREACH(xa, &xa_queue, entry) {
297 if (xa->unit == unit)
298 break;
299 }
300 if (xa == NULL)
ddfbb283 301 break;
0f50fb46 302 ++unit;
ddfbb283 303 }
0f50fb46 304 xa = kmalloc(sizeof(*xa), M_XDISK, M_WAITOK|M_ZERO);
0f50fb46
MD
305 xa->unit = unit;
306 xa->serializing = 1;
307 lwkt_token_init(&xa->tok, "xa");
308 TAILQ_INIT(&xa->circq);
309 TAILQ_INIT(&xa->bioq);
310 TAILQ_INIT(&xa->tag_freeq);
311 TAILQ_INIT(&xa->tag_pendq);
312 for (n = 0; n < MAXTAGS; ++n) {
313 tag = kmalloc(sizeof(*tag), M_XDISK, M_WAITOK|M_ZERO);
314 tag->xa = xa;
315 TAILQ_INSERT_TAIL(&xa->tag_freeq, tag, entry);
316 }
317 TAILQ_INSERT_TAIL(&xa_queue, xa, entry);
318 }
8d6d37b8
MD
319
320 /*
321 * (xa) is now serializing.
322 */
ddfbb283 323 xa->xaioc = *xaioc;
0f50fb46 324 xa->attached = 1;
ddfbb283
MD
325 lwkt_reltoken(&xdisk_token);
326
327 /*
328 * Create device
329 */
0f50fb46
MD
330 if (xa->dev == NULL) {
331 dev = disk_create(unit, &xa->disk, &xa_ops);
332 dev->si_drv1 = xa;
333 xa->dev = dev;
334 }
ddfbb283 335
0f50fb46
MD
336 xa->info.d_media_blksize = xaioc->blksize;
337 xa->info.d_media_blocks = xaioc->bytes / xaioc->blksize;
ddfbb283
MD
338 xa->info.d_dsflags = DSO_MBRQUIET | DSO_RAWPSIZE;
339 xa->info.d_secpertrack = 32;
340 xa->info.d_nheads = 64;
341 xa->info.d_secpercyl = xa->info.d_secpertrack * xa->info.d_nheads;
342 xa->info.d_ncylinders = 0;
0f50fb46
MD
343 if (xa->xaioc.fs_label[0])
344 xa->info.d_serialno = xa->xaioc.fs_label;
ddfbb283
MD
345
346 /*
347 * Set up messaging connection
348 */
349 ksnprintf(devname, sizeof(devname), "xa%d", unit);
0f50fb46
MD
350 kdmsg_iocom_init(&xa->iocom, xa,
351 KDMSG_IOCOMF_AUTOCONN |
352 KDMSG_IOCOMF_AUTOSPAN |
353 KDMSG_IOCOMF_AUTOCIRC |
354 KDMSG_IOCOMF_AUTOFORGE,
355 M_XDISK, xa_rcvdmsg);
ddfbb283 356 xa->iocom.exit_func = xa_exit;
0f50fb46 357
ddfbb283
MD
358 kdmsg_iocom_reconnect(&xa->iocom, fp, devname);
359
360 /*
0f50fb46
MD
361 * Setup our LNK_CONN advertisement for autoinitiate.
362 *
363 * Our filter is setup to only accept PEER_BLOCK/SERVER
364 * advertisements.
ddfbb283 365 */
0f50fb46
MD
366 xa->iocom.auto_lnk_conn.pfs_type = DMSG_PFSTYPE_CLIENT;
367 xa->iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1;
368 xa->iocom.auto_lnk_conn.peer_type = DMSG_PEER_BLOCK;
369 xa->iocom.auto_lnk_conn.peer_mask = 1LLU << DMSG_PEER_BLOCK;
370 xa->iocom.auto_lnk_conn.pfs_mask = 1LLU << DMSG_PFSTYPE_SERVER;
371 ksnprintf(xa->iocom.auto_lnk_conn.cl_label,
372 sizeof(xa->iocom.auto_lnk_conn.cl_label),
ddfbb283 373 "%s", xaioc->cl_label);
ddfbb283 374
0f50fb46
MD
375 /*
376 * We need a unique pfs_fsid to avoid confusion.
377 * We supply a rendezvous fs_label using the serial number.
378 */
379 kern_uuidgen(&xa->pfs_fsid, 1);
380 xa->iocom.auto_lnk_conn.pfs_fsid = xa->pfs_fsid;
381 ksnprintf(xa->iocom.auto_lnk_conn.fs_label,
382 sizeof(xa->iocom.auto_lnk_conn.fs_label),
383 "%s", xaioc->fs_label);
ddfbb283 384
0f50fb46
MD
385 /*
386 * Setup our LNK_SPAN advertisement for autoinitiate
387 */
388 xa->iocom.auto_lnk_span.pfs_type = DMSG_PFSTYPE_CLIENT;
389 xa->iocom.auto_lnk_span.proto_version = DMSG_SPAN_PROTO_1;
390 xa->iocom.auto_lnk_span.peer_type = DMSG_PEER_BLOCK;
391 ksnprintf(xa->iocom.auto_lnk_span.cl_label,
392 sizeof(xa->iocom.auto_lnk_span.cl_label),
393 "%s", xa->xaioc.cl_label);
394
395 kdmsg_iocom_autoinitiate(&xa->iocom, xa_autodmsg);
396 disk_setdiskinfo_sync(&xa->disk, &xa->info);
397
398 lwkt_gettoken(&xdisk_token);
399 xa->serializing = 0;
400 xa_terminate_check(xa);
401 lwkt_reltoken(&xdisk_token);
ddfbb283 402
ddfbb283
MD
403 return(0);
404}
405
406static int
0f50fb46 407xdisk_detach(struct xdisk_attach_ioctl *xaioc)
ddfbb283 408{
0f50fb46
MD
409 struct xa_softc *xa;
410
411 lwkt_gettoken(&xdisk_token);
412 for (;;) {
413 TAILQ_FOREACH(xa, &xa_queue, entry) {
414 if (strcmp(xa->iocom.auto_lnk_conn.fs_label,
415 xaioc->fs_label) == 0) {
416 break;
417 }
418 }
419 if (xa == NULL || xa->serializing == 0) {
420 xa->serializing = 1;
421 break;
422 }
423 tsleep(xa, 0, "xadet", hz / 10);
ddfbb283 424 }
0f50fb46 425 if (xa) {
0f50fb46
MD
426 kdmsg_iocom_uninit(&xa->iocom);
427 xa->serializing = 0;
428 }
429 lwkt_reltoken(&xdisk_token);
430 return(0);
ddfbb283
MD
431}
432
/*
 * Called from iocom core transmit thread upon disconnect.
 *
 * Acquires xa->tok before xdisk_token (this ordering must be
 * preserved).  Drains pending tags, tears down the iocom unless
 * another thread is already serializing, then marks the device
 * detached and attempts final destruction.
 */
static
void
xa_exit(kdmsg_iocom_t *iocom)
{
	struct xa_softc *xa = iocom->handle;

	lwkt_gettoken(&xa->tok);
	lwkt_gettoken(&xdisk_token);

	/*
	 * We must wait for any I/O's to complete to ensure that all
	 * state structure references are cleaned up before returning.
	 */
	xa->attached = -1;	/* force deferral or failure */
	while (TAILQ_FIRST(&xa->tag_pendq)) {
		tsleep(xa, 0, "xabiow", hz / 10);
	}

	/*
	 * All serializing code checks for de-initialization so only
	 * do it if we aren't already serializing.
	 */
	if (xa->serializing == 0) {
		xa->serializing = 1;
		kdmsg_iocom_uninit(iocom);
		xa->serializing = 0;
	}

	/*
	 * If the drive is not in use and no longer attached it can be
	 * destroyed.
	 */
	xa->attached = 0;
	xa_terminate_check(xa);
	lwkt_reltoken(&xdisk_token);
	lwkt_reltoken(&xa->tok);
}
ddfbb283 473
0f50fb46
MD
/*
 * Determine if we can destroy the xa_softc.
 *
 * Called with xdisk_token held.  Destruction requires opencnt == 0,
 * attached == 0, and no other thread serializing.  On destruction all
 * pending tags are failed with ENXIO, the disk device is destroyed,
 * the tag pool is freed, and the softc is removed from xa_queue and
 * freed.
 */
static
void
xa_terminate_check(struct xa_softc *xa)
{
	xa_tag_t *tag;
	struct bio *bio;

	if (xa->opencnt || xa->attached || xa->serializing)
		return;
	xa->serializing = 1;
	kdmsg_iocom_uninit(&xa->iocom);

	/*
	 * When destroying an xa make sure all pending I/O (typically
	 * from the disk probe) is done.
	 *
	 * XXX what about new I/O initiated prior to disk_destroy().
	 */
	while ((tag = TAILQ_FIRST(&xa->tag_pendq)) != NULL) {
		TAILQ_REMOVE(&xa->tag_pendq, tag, entry);
		if ((bio = tag->bio) != NULL) {
			/* fail the bio; the link is gone for good */
			tag->bio = NULL;
			bio->bio_buf->b_error = ENXIO;
			bio->bio_buf->b_flags |= B_ERROR;
			biodone(bio);
		}
		TAILQ_INSERT_TAIL(&xa->tag_freeq, tag, entry);
	}
	if (xa->dev) {
		disk_destroy(&xa->disk);
		xa->dev->si_drv1 = NULL;
		xa->dev = NULL;
	}
	KKASSERT(xa->opencnt == 0 && xa->attached == 0);
	while ((tag = TAILQ_FIRST(&xa->tag_freeq)) != NULL) {
		TAILQ_REMOVE(&xa->tag_freeq, tag, entry);
		tag->xa = NULL;
		kfree(tag, M_XDISK);
	}
	KKASSERT(TAILQ_EMPTY(&xa->tag_pendq));
	TAILQ_REMOVE(&xa_queue, xa, entry);	/* XXX */
	kfree(xa, M_XDISK);
}
522
0f50fb46
MD
/*
 * Shim to catch and record virtual circuit events.
 *
 * Maintains xa->circq (circuits sorted by ascending weight, best
 * first) and restarts deferred I/O when a new circuit becomes
 * available.
 */
static void
xa_autodmsg(kdmsg_msg_t *msg)
{
	xa_softc_t *xa = msg->iocom->handle;

	kdmsg_circuit_t *circ;
	kdmsg_circuit_t *cscan;
	uint32_t xcmd;

	/*
	 * Because this is just a shim we don't have a state callback for
	 * the transactions we are sniffing, so make things easier by
	 * calculating the original command along with the current message's
	 * flags.  This is because transactions are made up of numerous
	 * messages and only the first typically specifies the actual command.
	 */
	if (msg->state) {
		xcmd = msg->state->icmd |
		       (msg->any.head.cmd & (DMSGF_CREATE |
					     DMSGF_DELETE |
					     DMSGF_REPLY));
	} else {
		xcmd = msg->any.head.cmd;
	}

	/*
	 * Add or remove a circuit, sorted by weight (lower numbers are
	 * better).
	 */
	switch(xcmd) {
	case DMSG_LNK_CIRC | DMSGF_CREATE | DMSGF_REPLY:
		/*
		 * Track established circuits
		 */
		circ = msg->state->any.circ;
		lwkt_gettoken(&xa->tok);
		if (circ->recorded == 0) {
			/* insertion sort by weight */
			TAILQ_FOREACH(cscan, &xa->circq, entry) {
				if (circ->weight < cscan->weight)
					break;
			}
			if (cscan)
				TAILQ_INSERT_BEFORE(cscan, circ, entry);
			else
				TAILQ_INSERT_TAIL(&xa->circq, circ, entry);
			circ->recorded = 1;
		}

		/*
		 * Restart any deferred I/O.
		 */
		xa_restart_deferred(xa);
		lwkt_reltoken(&xa->tok);
		break;
	case DMSG_LNK_CIRC | DMSGF_DELETE | DMSGF_REPLY:
		/*
		 * Losing virtual circuit.  Scan pending tags.
		 */
		circ = msg->state->any.circ;
		lwkt_gettoken(&xa->tok);
		if (circ->recorded) {
			TAILQ_REMOVE(&xa->circq, circ, entry);
			circ->recorded = 0;
		}
		lwkt_reltoken(&xa->tok);
		break;
	default:
		break;
	}
}
596
597static int
0f50fb46 598xa_rcvdmsg(kdmsg_msg_t *msg)
ddfbb283 599{
0f50fb46 600 switch(msg->any.head.cmd & DMSGF_TRANSMASK) {
ddfbb283
MD
601 case DMSG_DBG_SHELL:
602 /*
3b76886b
MD
603 * Execute shell command (not supported atm).
604 *
605 * This is a one-way packet but if not (e.g. if part of
606 * a streaming transaction), we will have already closed
607 * our end.
ddfbb283
MD
608 */
609 kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
610 break;
611 case DMSG_DBG_SHELL | DMSGF_REPLY:
3b76886b
MD
612 /*
613 * Receive one or more replies to a shell command that we
614 * sent.
615 *
616 * This is a one-way packet but if not (e.g. if part of
617 * a streaming transaction), we will have already closed
618 * our end.
619 */
ddfbb283
MD
620 if (msg->aux_data) {
621 msg->aux_data[msg->aux_size - 1] = 0;
8d6d37b8 622 kprintf("xdisk: DEBUGMSG: %s\n", msg->aux_data);
ddfbb283
MD
623 }
624 break;
625 default:
3b76886b 626 /*
0f50fb46
MD
627 * Unsupported LNK message received. We only need to
628 * reply if it's a transaction in order to close our end.
629 * Ignore any one-way messages are any further messages
630 * associated with the transaction.
631 *
632 * NOTE: This case also includes DMSG_LNK_ERROR messages
633 * which might be one-way, replying to those would
634 * cause an infinite ping-pong.
3b76886b 635 */
0f50fb46
MD
636 if (msg->any.head.cmd & DMSGF_CREATE)
637 kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
ddfbb283
MD
638 break;
639 }
640 return(0);
641}
642
ddfbb283
MD
643
644/************************************************************************
645 * XA DEVICE INTERFACE *
646 ************************************************************************/
647
/*
 * Open an xa disk device.  The first open issues a long-lived BLK_OPEN
 * transaction (xa->opentag) against the server and records the
 * returned keyid; subsequent opens just bump opencnt.
 *
 * Returns 0 on success or ENXIO if the device raced destruction, no
 * circuit/tag is available, or the BLK_OPEN failed.
 */
static int
xa_open(struct dev_open_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	xa_softc_t *xa;
	xa_tag_t *tag;
	kdmsg_msg_t *msg;
	int error;

	dev->si_bsize_phys = 512;
	dev->si_bsize_best = 32768;

	/*
	 * Interlock open with opencnt, wait for attachment operations
	 * to finish.
	 */
	lwkt_gettoken(&xdisk_token);
again:
	xa = dev->si_drv1;
	if (xa == NULL) {
		lwkt_reltoken(&xdisk_token);
		return ENXIO;	/* raced destruction */
	}
	if (xa->serializing) {
		tsleep(xa, 0, "xarace", hz / 10);
		goto again;
	}
	if (xa->attached == 0) {
		lwkt_reltoken(&xdisk_token);
		return ENXIO;	/* raced destruction */
	}

	/*
	 * Serialize initial open
	 */
	if (xa->opencnt++ > 0) {
		lwkt_reltoken(&xdisk_token);
		return(0);
	}
	xa->serializing = 1;
	lwkt_reltoken(&xdisk_token);

	tag = xa_setup_cmd(xa, NULL);
	if (tag == NULL) {
		/* no circuit or no free tag; undo the open */
		lwkt_gettoken(&xdisk_token);
		KKASSERT(xa->opencnt > 0);
		--xa->opencnt;
		xa->serializing = 0;
		xa_terminate_check(xa);
		lwkt_reltoken(&xdisk_token);
		return(ENXIO);
	}
	msg = kdmsg_msg_alloc(&xa->iocom, tag->circ,
			      DMSG_BLK_OPEN | DMSGF_CREATE,
			      xa_sync_completion, tag);
	msg->any.blk_open.modes = DMSG_BLKOPEN_RD | DMSG_BLKOPEN_WR;
	xa_start(tag, msg);
	if (xa_wait(tag, 0) == 0) {
		xa->keyid = tag->status.keyid;
		xa->opentag = tag;	/* leave tag open */
		xa->serializing = 0;
		error = 0;
	} else {
		xa_done(tag, 0);
		lwkt_gettoken(&xdisk_token);
		KKASSERT(xa->opencnt > 0);
		--xa->opencnt;
		xa->serializing = 0;
		xa_terminate_check(xa);
		lwkt_reltoken(&xdisk_token);
		error = ENXIO;
	}
	return (error);
}
722
/*
 * Close an xa disk device.  On last close the long-lived BLK_OPEN
 * transaction (opentag) is terminated and the softc may be destroyed
 * via xa_terminate_check().
 */
static int
xa_close(struct dev_close_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	xa_softc_t *xa;
	xa_tag_t *tag;

	xa = dev->si_drv1;
	if (xa == NULL)
		return ENXIO;	/* raced destruction */

	lwkt_gettoken(&xa->tok);
	if ((tag = xa->opentag) != NULL) {
		/* close our side of the BLK_OPEN and wait it out */
		xa->opentag = NULL;
		kdmsg_state_reply(tag->state, DMSG_ERR_NOSUPP);
		while (tag->done == 0)
			xa_wait(tag, tag->waitseq);
		xa_done(tag, 0);
	}
	lwkt_reltoken(&xa->tok);

	lwkt_gettoken(&xdisk_token);
	KKASSERT(xa->opencnt > 0);
	--xa->opencnt;
	xa_terminate_check(xa);
	lwkt_reltoken(&xdisk_token);

	return(0);
}
752
753static int
754xa_strategy(struct dev_strategy_args *ap)
755{
0f50fb46
MD
756 xa_softc_t *xa = ap->a_head.a_dev->si_drv1;
757 xa_tag_t *tag;
758 struct bio *bio = ap->a_bio;
759
8d6d37b8
MD
760 /*
761 * Allow potentially temporary link failures to fail the I/Os
762 * only if the device is not open. That is, we allow the disk
763 * probe code prior to mount to fail.
764 */
765 if (xa->attached == 0 && xa->opencnt == 0) {
766 bio->bio_buf->b_error = ENXIO;
767 bio->bio_buf->b_flags |= B_ERROR;
768 biodone(bio);
769 return(0);
770 }
0f50fb46
MD
771
772 tag = xa_setup_cmd(xa, bio);
773 if (tag)
774 xa_start(tag, NULL);
775 return(0);
ddfbb283
MD
776}
777
778static int
779xa_ioctl(struct dev_ioctl_args *ap)
780{
0f50fb46 781 return(ENOTTY);
ddfbb283
MD
782}
783
784static int
785xa_size(struct dev_psize_args *ap)
786{
787 struct xa_softc *xa;
788
789 if ((xa = ap->a_head.a_dev->si_drv1) == NULL)
790 return (ENXIO);
ddfbb283
MD
791 ap->a_result = xa->info.d_media_blocks;
792 return (0);
793}
0f50fb46
MD
794
795/************************************************************************
796 * XA BLOCK PROTOCOL STATE MACHINE *
797 ************************************************************************
798 *
799 * Implement tag/msg setup and related functions.
800 */
/*
 * Allocate a tag for a new transaction, binding it to the best
 * (lowest-weight) available circuit and holding a circuit reference.
 *
 * Returns NULL when no circuit exists, the device is detached, or the
 * tag pool is exhausted; in that case a non-NULL bio is queued on
 * xa->bioq for later restart (see xa_restart_deferred()).
 */
static xa_tag_t *
xa_setup_cmd(xa_softc_t *xa, struct bio *bio)
{
	kdmsg_circuit_t *circ;
	xa_tag_t *tag;

	/*
	 * Only get a tag if we have a valid virtual circuit to the server.
	 */
	lwkt_gettoken(&xa->tok);
	if ((circ = TAILQ_FIRST(&xa->circq)) == NULL || xa->attached <= 0) {
		tag = NULL;
	} else if ((tag = TAILQ_FIRST(&xa->tag_freeq)) != NULL) {
		TAILQ_REMOVE(&xa->tag_freeq, tag, entry);
		tag->bio = bio;
		tag->circ = circ;
		kdmsg_circ_hold(circ);	/* dropped in xa_done() */
		TAILQ_INSERT_TAIL(&xa->tag_pendq, tag, entry);
	}

	/*
	 * If we can't dispatch now and this is a bio, queue it for later.
	 */
	if (tag == NULL && bio) {
		TAILQ_INSERT_TAIL(&xa->bioq, bio, bio_act);
	}
	lwkt_reltoken(&xa->tok);

	return (tag);
}
831
/*
 * Start a transaction on a tag.  If (msg) is NULL a BLOCK message is
 * constructed from the tag's bio (read/write/flush/freeblks);
 * unsupported buffer commands fail the bio with EIO.  The message, if
 * any, is written to the iocom; otherwise the tag is completed
 * immediately via xa_done().
 */
static void
xa_start(xa_tag_t *tag, kdmsg_msg_t *msg)
{
	xa_softc_t *xa = tag->xa;

	if (msg == NULL) {
		struct bio *bio;
		struct buf *bp;

		KKASSERT(tag->bio);
		bio = tag->bio;
		bp = bio->bio_buf;

		switch(bp->b_cmd) {
		case BUF_CMD_READ:
			msg = kdmsg_msg_alloc(&xa->iocom, tag->circ,
					      DMSG_BLK_READ |
					      DMSGF_CREATE | DMSGF_DELETE,
					      xa_bio_completion, tag);
			msg->any.blk_read.keyid = xa->keyid;
			msg->any.blk_read.offset = bio->bio_offset;
			msg->any.blk_read.bytes = bp->b_bcount;
			break;
		case BUF_CMD_WRITE:
			msg = kdmsg_msg_alloc(&xa->iocom, tag->circ,
					      DMSG_BLK_WRITE |
					      DMSGF_CREATE | DMSGF_DELETE,
					      xa_bio_completion, tag);
			msg->any.blk_write.keyid = xa->keyid;
			msg->any.blk_write.offset = bio->bio_offset;
			msg->any.blk_write.bytes = bp->b_bcount;
			/* write payload rides along as aux data */
			msg->aux_data = bp->b_data;
			msg->aux_size = bp->b_bcount;
			break;
		case BUF_CMD_FLUSH:
			msg = kdmsg_msg_alloc(&xa->iocom, tag->circ,
					      DMSG_BLK_FLUSH |
					      DMSGF_CREATE | DMSGF_DELETE,
					      xa_bio_completion, tag);
			msg->any.blk_flush.keyid = xa->keyid;
			msg->any.blk_flush.offset = bio->bio_offset;
			msg->any.blk_flush.bytes = bp->b_bcount;
			break;
		case BUF_CMD_FREEBLKS:
			msg = kdmsg_msg_alloc(&xa->iocom, tag->circ,
					      DMSG_BLK_FREEBLKS |
					      DMSGF_CREATE | DMSGF_DELETE,
					      xa_bio_completion, tag);
			msg->any.blk_freeblks.keyid = xa->keyid;
			msg->any.blk_freeblks.offset = bio->bio_offset;
			msg->any.blk_freeblks.bytes = bp->b_bcount;
			break;
		default:
			/* unsupported buffer command, fail the bio */
			bp->b_flags |= B_ERROR;
			bp->b_error = EIO;
			biodone(bio);
			tag->bio = NULL;
			break;
		}
	}

	tag->done = 0;
	tag->waitseq = 0;
	if (msg) {
		tag->state = msg->state;
		kdmsg_msg_write(msg);
	} else {
		xa_done(tag, 1);
	}
}
902
903static uint32_t
904xa_wait(xa_tag_t *tag, int seq)
905{
906 xa_softc_t *xa = tag->xa;
907
908 lwkt_gettoken(&xa->tok);
909 while (tag->waitseq == seq)
910 tsleep(tag, 0, "xawait", 0);
911 lwkt_reltoken(&xa->tok);
912 return (tag->status.head.error);
913}
914
/*
 * Complete a tag.  If deferred bios are queued the tag is immediately
 * reused to dispatch the next one; otherwise the circuit reference is
 * dropped and the tag returned to the free pool.
 *
 * NOTE: wasbio is currently unused; the bioq is always checked.
 */
static void
xa_done(xa_tag_t *tag, int wasbio)
{
	xa_softc_t *xa = tag->xa;
	struct bio *bio;

	KKASSERT(tag->bio == NULL);
	tag->done = 1;
	tag->state = NULL;

	lwkt_gettoken(&xa->tok);
	if ((bio = TAILQ_FIRST(&xa->bioq)) != NULL) {
		/* reuse the tag for the next deferred bio */
		TAILQ_REMOVE(&xa->bioq, bio, bio_act);
		tag->bio = bio;
		lwkt_reltoken(&xa->tok);
		xa_start(tag, NULL);
	} else {
		if (tag->circ) {
			kdmsg_circ_drop(tag->circ);	/* ref from xa_setup_cmd */
			tag->circ = NULL;
		}
		TAILQ_REMOVE(&xa->tag_pendq, tag, entry);
		TAILQ_INSERT_TAIL(&xa->tag_freeq, tag, entry);
		lwkt_reltoken(&xa->tok);
	}
}
941
/*
 * Completion callback for synchronous transactions (BLK_OPEN).
 * Records the reply status, terminates our side when the remote
 * deletes its end, bumps waitseq, and wakes any xa_wait()er.
 */
static int
xa_sync_completion(kdmsg_state_t *state, kdmsg_msg_t *msg)
{
	xa_tag_t *tag = state->any.any;
	xa_softc_t *xa = tag->xa;

	switch(msg->any.head.cmd & DMSGF_CMDSWMASK) {
	case DMSG_LNK_ERROR | DMSGF_REPLY:
		/* generic error reply, only the header is meaningful */
		bzero(&tag->status, sizeof(tag->status));
		tag->status.head = msg->any.head;
		break;
	case DMSG_BLK_ERROR | DMSGF_REPLY:
		tag->status = msg->any.blk_error;
		break;
	}
	if (msg->any.head.cmd & DMSGF_DELETE) {	/* receive termination */
		kdmsg_msg_reply(msg, 0);	/* terminate our side */
		tag->done = 1;
	}
	lwkt_gettoken(&xa->tok);
	++tag->waitseq;
	lwkt_reltoken(&xa->tok);

	wakeup(tag);

	return (0);
}
969
/*
 * Completion callback for bio-backed BLOCK transactions.  Records the
 * reply status, copies read data (zero-extending short reads), and
 * finishes the bio.  On transaction termination the tag is completed
 * and, if needed, our side of the transaction is closed.
 */
static int
xa_bio_completion(kdmsg_state_t *state, kdmsg_msg_t *msg)
{
	xa_tag_t *tag = state->any.any;
	/*xa_softc_t *xa = tag->xa;*/
	struct bio *bio;
	struct buf *bp;

	/*
	 * Get the bio from the tag.  If no bio is present we just do
	 * 'done' handling.
	 */
	if ((bio = tag->bio) == NULL)
		goto handle_done;
	bp = bio->bio_buf;

	/*
	 * Process return status
	 */
	switch(msg->any.head.cmd & DMSGF_CMDSWMASK) {
	case DMSG_LNK_ERROR | DMSGF_REPLY:
		/* generic error: treat as all-or-nothing */
		bzero(&tag->status, sizeof(tag->status));
		tag->status.head = msg->any.head;
		if (tag->status.head.error)
			tag->status.resid = bp->b_bcount;
		else
			tag->status.resid = 0;
		break;
	case DMSG_BLK_ERROR | DMSGF_REPLY:
		tag->status = msg->any.blk_error;
		break;
	}

	/*
	 * Process bio completion
	 *
	 * For reads any returned data is zero-extended if necessary, so
	 * the server can short-cut any all-zeros reads if it desires.
	 */
	switch(bp->b_cmd) {
	case BUF_CMD_READ:
		if (msg->aux_data && msg->aux_size) {
			if (msg->aux_size < bp->b_bcount) {
				/* short read, zero-fill the remainder */
				bcopy(msg->aux_data, bp->b_data, msg->aux_size);
				bzero(bp->b_data + msg->aux_size,
				      bp->b_bcount - msg->aux_size);
			} else {
				bcopy(msg->aux_data, bp->b_data, bp->b_bcount);
			}
		} else {
			/* no payload: server short-cut an all-zeros read */
			bzero(bp->b_data, bp->b_bcount);
		}
		/* fall through */
	case BUF_CMD_WRITE:
	case BUF_CMD_FLUSH:
	case BUF_CMD_FREEBLKS:
	default:
		if (tag->status.resid > bp->b_bcount)
			tag->status.resid = bp->b_bcount;
		bp->b_resid = tag->status.resid;
		if ((bp->b_error = tag->status.head.error) != 0) {
			bp->b_flags |= B_ERROR;
		} else {
			bp->b_resid = 0;
		}
		biodone(bio);
		tag->bio = NULL;
		break;
	}

	/*
	 * Handle completion of the transaction.  If the bioq is not empty
	 * we can initiate another bio on the same tag.
	 *
	 * NOTE: Most of our transactions will be single-message
	 *	 CREATE+DELETEs, so we won't have to terminate the
	 *	 transaction separately, here.  But just in case they
	 *	 aren't be sure to terminate the transaction.
	 */
handle_done:
	if (msg->any.head.cmd & DMSGF_DELETE) {
		xa_done(tag, 1);
		if ((state->txcmd & DMSGF_DELETE) == 0) {
			kdmsg_msg_reply(msg, 0);
		}
	}
	return (0);
}
0c98b966
MD
1058
/*
 * Restart as much deferred I/O as we can.
 *
 * Called with xa->tok held (the token is recursively reacquired by
 * xa_setup_cmd()).  NOTE: xa_setup_cmd() is called with a NULL bio so
 * a tag-allocation failure does not re-queue the bio we are about to
 * dequeue ourselves.
 */
static
void
xa_restart_deferred(xa_softc_t *xa)
{
	struct bio *bio;
	xa_tag_t *tag;

	while ((bio = TAILQ_FIRST(&xa->bioq)) != NULL) {
		tag = xa_setup_cmd(xa, NULL);
		if (tag == NULL)
			break;
		TAILQ_REMOVE(&xa->bioq, bio, bio_act);
		tag->bio = bio;
		xa_start(tag, NULL);
	}
}