2 * Copyright (c) 2012 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
38 #include <sys/sysctl.h>
41 #include <sys/disklabel.h>
42 #include <sys/disklabel32.h>
43 #include <sys/disklabel64.h>
44 #include <sys/diskslice.h>
45 #include <sys/diskmbr.h>
47 #include <sys/malloc.h>
48 #include <sys/device.h>
49 #include <sys/devfs.h>
50 #include <sys/thread.h>
51 #include <sys/queue.h>
58 #include <sys/msgport2.h>
/* Malloc type handed to kdmsg_iocom_init() by disk_iocom_init(). */
71 static MALLOC_DEFINE(M_DMSG_DISK, "dmsg_disk", "disk dmsg");
/*
 * Number of block I/Os started by the disk_blk_*() handlers that
 * diskiodone() has not yet accounted for.  Adjusted with atomic ops and
 * exported read/write as the debug.blk_active sysctl.
 */
73 static int blk_active;
74 SYSCTL_INT(_debug, OID_AUTO, blk_active, CTLFLAG_RW, &blk_active, 0,
75 "Number of active iocom IOs");
/* Forward declarations for the file-local helpers and message handlers. */
77 static int disk_iocom_reconnect(struct disk *dp, struct file *fp);
78 static int disk_rcvdmsg(kdmsg_msg_t *msg);
80 static void disk_blk_open(struct disk *dp, kdmsg_msg_t *msg);
81 static void disk_blk_read(struct disk *dp, kdmsg_msg_t *msg);
82 static void disk_blk_write(struct disk *dp, kdmsg_msg_t *msg);
83 static void disk_blk_flush(struct disk *dp, kdmsg_msg_t *msg);
84 static void disk_blk_freeblks(struct disk *dp, kdmsg_msg_t *msg);
85 static void diskiodone(struct bio *bio);
/*
 * Initialize the iocom embedded in the disk (dp->d_iocom).
 *
 * dp itself is registered as the iocom handle (disk_rcvdmsg() reads it
 * back from msg->state->iocom->handle), M_DMSG_DISK is supplied as the
 * malloc type and disk_rcvdmsg() as the receive callback.
 *
 * The AUTOCONN / AUTORXSPAN / AUTOTXSPAN flags are passed through to
 * kdmsg_iocom_init(); presumably they let the kdmsg layer drive the
 * LNK_CONN / LNK_SPAN exchange on our behalf -- confirm against kdmsg.
 */
88 disk_iocom_init(struct disk *dp)
90 kdmsg_iocom_init(&dp->d_iocom, dp,
91 KDMSG_IOCOMF_AUTOCONN |
92 KDMSG_IOCOMF_AUTORXSPAN |
93 KDMSG_IOCOMF_AUTOTXSPAN,
94 M_DMSG_DISK, disk_rcvdmsg);
/*
 * NOTE(review): only the signature of this function survives in this
 * listing; no statement of its body is visible, so what (if anything)
 * an update does cannot be determined from here -- check the full file.
 */
98 disk_iocom_update(struct disk *dp)
/*
 * Counterpart of disk_iocom_init(): hands the disk's embedded iocom
 * (dp->d_iocom) to kdmsg_iocom_uninit().
 */
103 disk_iocom_uninit(struct disk *dp)
105 kdmsg_iocom_uninit(&dp->d_iocom);
/*
 * ioctl entry point for the disk iocom.
 *
 * In the visible path a struct disk_ioc_recluster (recl, presumably
 * derived from data -- the assignment is not visible here) supplies a
 * file descriptor.  holdfp() turns recl->fd into a held struct file for
 * curthread, which is then passed to disk_iocom_reconnect(); that call's
 * return value is stored in error.
 *
 * NOTE(review): the dispatch on cmd, the handling of a failed holdfp()
 * and the final return are not visible in this listing.
 */
109 disk_iocom_ioctl(struct disk *dp, u_long cmd, void *data)
112 struct disk_ioc_recluster *recl;
118 fp = holdfp(curthread, recl->fd, -1);
120 error = disk_iocom_reconnect(dp, fp);
/*
 * (Re)connect the disk's iocom to the file fp and fill in the templates
 * used for the automatic LNK_CONN and LNK_SPAN messages, then call
 * kdmsg_iocom_autoinitiate().
 *
 * devname is formatted as "<dev_dname><dkunit>" of the raw device and is
 * passed to kdmsg_iocom_reconnect().  Both templates use protocol
 * DMSG_SPAN_PROTO_1 and peer type DMSG_PEER_BLOCK.  The span template
 * additionally carries the media size and block size from dp->d_info,
 * copies its peer_label from the conn template, and sets pfs_label to
 * the disk serial number when one exists, otherwise to
 * "<hostname>.<devname>".
 *
 * NOTE(review): the return statement is not visible in this listing.
 */
134 disk_iocom_reconnect(struct disk *dp, struct file *fp)
138 ksnprintf(devname, sizeof(devname), "%s%d",
139 dev_dname(dp->d_rawdev), dkunit(dp->d_rawdev));
141 kdmsg_iocom_reconnect(&dp->d_iocom, fp, devname);
143 dp->d_iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1;
144 dp->d_iocom.auto_lnk_conn.peer_type = DMSG_PEER_BLOCK;
/*
 * NOTE(review): peer_mask is assigned twice back to back.  As listed, the
 * second store ((uint64_t)-1, i.e. all peer types) overrides the
 * single-bit DMSG_PEER_BLOCK mask -- confirm this is intentional.
 */
145 dp->d_iocom.auto_lnk_conn.peer_mask = 1LLU << DMSG_PEER_BLOCK;
146 dp->d_iocom.auto_lnk_conn.peer_mask = (uint64_t)-1;
/*
 * NOTE(review): peer_label is formatted three times below (once from the
 * serial number, twice from devname).  The listing has probably dropped
 * else / preprocessor-conditional lines around these; which variant is
 * actually compiled cannot be determined from here.
 */
148 if (dp->d_info.d_serialno) {
149 ksnprintf(dp->d_iocom.auto_lnk_conn.peer_label,
150 sizeof(dp->d_iocom.auto_lnk_conn.peer_label),
151 "%s/%s", hostname, dp->d_info.d_serialno);
153 ksnprintf(dp->d_iocom.auto_lnk_conn.peer_label,
154 sizeof(dp->d_iocom.auto_lnk_conn.peer_label),
155 "%s/%s", hostname, devname);
158 ksnprintf(dp->d_iocom.auto_lnk_conn.peer_label,
159 sizeof(dp->d_iocom.auto_lnk_conn.peer_label),
160 "%s/%s", hostname, devname);
/* Span template: advertise the media geometry and reuse the conn label. */
162 dp->d_iocom.auto_lnk_span.proto_version = DMSG_SPAN_PROTO_1;
163 dp->d_iocom.auto_lnk_span.peer_type = DMSG_PEER_BLOCK;
164 dp->d_iocom.auto_lnk_span.media.block.bytes =
165 dp->d_info.d_media_size;
166 dp->d_iocom.auto_lnk_span.media.block.blksize =
167 dp->d_info.d_media_blksize;
168 ksnprintf(dp->d_iocom.auto_lnk_span.peer_label,
169 sizeof(dp->d_iocom.auto_lnk_span.peer_label),
170 "%s", dp->d_iocom.auto_lnk_conn.peer_label);
171 if (dp->d_info.d_serialno) {
172 ksnprintf(dp->d_iocom.auto_lnk_span.pfs_label,
173 sizeof(dp->d_iocom.auto_lnk_span.pfs_label),
174 "%s", dp->d_info.d_serialno);
177 * If no serial number is available generate a dummy serial
178 * number from the host and device name and pray. This will
179 * allow e.g. /dev/vn* to look meaningful on a remote machine.
181 ksnprintf(dp->d_iocom.auto_lnk_span.pfs_label,
182 sizeof(dp->d_iocom.auto_lnk_span.pfs_label),
183 "%s.%s", hostname, devname);
186 kdmsg_iocom_autoinitiate(&dp->d_iocom, NULL);
/*
 * Receive callback registered with kdmsg_iocom_init().
 *
 * The disk is recovered from msg->state->iocom->handle.  Processing is
 * two-staged:
 *
 *  1. Debug-shell messages are handled first, switching on the message's
 *     own command: the visible code answers with DMSG_ERR_NOSUPP and
 *     prints the text of a DMSG_DBG_SHELL reply to the console.
 *  2. Anything else must belong to a transaction.  A message that is
 *     still on the iocom's state0 is rejected with DMSG_ERR_NOSUPP;
 *     otherwise the command that opened the transaction (state->rxcmd)
 *     selects one of the disk_blk_*() handlers.
 *
 * NOTE(review): several case labels and the return value are not visible
 * in this listing.
 */
192 disk_rcvdmsg(kdmsg_msg_t *msg)
194 struct disk *dp = msg->state->iocom->handle;
197 * Handle debug messages (these might not be in transactions)
199 switch(msg->any.head.cmd & DMSGF_CMDSWMASK) {
202 * Execute shell command (not supported atm)
204 kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
206 case DMSG_DBG_SHELL | DMSGF_REPLY:
/*
 * Force NUL termination of the auxiliary data before printing it as a
 * string.  NOTE(review): this indexes aux_size - 1; any guard against
 * missing or empty aux data is not visible here -- confirm one exists.
 */
208 msg->aux_data[msg->aux_size - 1] = 0;
209 kprintf("diskiocom: DEBUGMSG: %s\n", msg->aux_data);
215 * All remaining messages must be in a transaction.
217 * NOTE! We currently don't care if the transaction is just
218 * the span transaction (for disk probes) or if it is the
219 * BLK_OPEN transaction.
221 * NOTE! We are switching on the first message's command. The
222 * actual message command within the transaction may be
223 * different (if streaming within a transaction).
225 if (msg->state == &msg->state->iocom->state0) {
226 kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
230 switch(msg->state->rxcmd & DMSGF_CMDSWMASK) {
232 disk_blk_open(dp, msg);
236 * not reached normally but leave in for completeness
238 disk_blk_read(dp, msg);
241 disk_blk_write(dp, msg);
244 disk_blk_flush(dp, msg);
246 case DMSG_BLK_FREEBLKS:
247 disk_blk_freeblks(dp, msg);
/*
 * Unsupported command: only non-reply messages are answered.  A message
 * carrying DMSGF_DELETE is answered with kdmsg_msg_reply(), anything else
 * with kdmsg_msg_result(); both carry DMSG_ERR_NOSUPP.
 */
250 if ((msg->any.head.cmd & DMSGF_REPLY) == 0) {
251 if (msg->any.head.cmd & DMSGF_DELETE)
252 kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
254 kdmsg_msg_result(msg, DMSG_ERR_NOSUPP);
/*
 * Handle messages inside a BLK_OPEN transaction.
 *
 * Per-transaction state (struct dios_open) is allocated from M_DEVBUF on
 * the first DMSG_BLK_OPEN and parked in msg->state->any.any.
 *
 *  - DMSG_BLK_OPEN:  the DMSG_BLKOPEN_RD / DMSG_BLKOPEN_WR bits in
 *    blk_open.modes are examined and the raw device is opened with
 *    dev_dopen() using proc0's credentials.
 *  - DMSG_BLK_CLOSE: the same mode bits are examined and the raw device
 *    is closed with dev_dclose().
 *  - DMSGF_DELETE on the message: the transaction is ending.  Opens still
 *    recorded in openst->openrd / openst->openwr are closed (combined
 *    read+write first, then the read-only and write-only leftovers), the
 *    state is freed and the final status goes out via kdmsg_msg_reply().
 *    Without DMSGF_DELETE the status goes out via kdmsg_msg_result().
 *
 * error starts as DMSG_ERR_NOSUPP, so a message that matches neither
 * command reports "not supported".
 *
 * NOTE(review): the statements that build fflags and that adjust
 * openrd / openwr are not visible in this listing.
 */
263 disk_blk_open(struct disk *dp, kdmsg_msg_t *msg)
265 struct dios_open *openst;
266 int error = DMSG_ERR_NOSUPP;
269 openst = msg->state->any.any;
270 if ((msg->any.head.cmd & DMSGF_CMDSWMASK) == DMSG_BLK_OPEN) {
271 if (openst == NULL) {
272 openst = kmalloc(sizeof(*openst), M_DEVBUF,
274 msg->state->any.any = openst;
277 if (msg->any.blk_open.modes & DMSG_BLKOPEN_RD)
279 if (msg->any.blk_open.modes & DMSG_BLKOPEN_WR)
281 error = dev_dopen(dp->d_rawdev, fflags, S_IFCHR, proc0.p_ucred, NULL, NULL);
285 if (msg->any.blk_open.modes & DMSG_BLKOPEN_RD)
287 if (msg->any.blk_open.modes & DMSG_BLKOPEN_WR)
292 if ((msg->any.head.cmd & DMSGF_CMDSWMASK) == DMSG_BLK_CLOSE &&
295 if ((msg->any.blk_open.modes & DMSG_BLKOPEN_RD) &&
299 if ((msg->any.blk_open.modes & DMSG_BLKOPEN_WR) &&
303 error = dev_dclose(dp->d_rawdev, fflags, S_IFCHR, NULL);
307 if (msg->any.blk_open.modes & DMSG_BLKOPEN_RD)
309 if (msg->any.blk_open.modes & DMSG_BLKOPEN_WR)
314 if (msg->any.head.cmd & DMSGF_DELETE) {
316 while (openst->openrd && openst->openwr) {
319 dev_dclose(dp->d_rawdev, FREAD|FWRITE, S_IFCHR, NULL);
321 while (openst->openrd) {
323 dev_dclose(dp->d_rawdev, FREAD, S_IFCHR, NULL);
325 while (openst->openwr) {
327 dev_dclose(dp->d_rawdev, FWRITE, S_IFCHR, NULL);
329 kfree(openst, M_DEVBUF);
330 msg->state->any.any = NULL;
332 kdmsg_msg_reply(msg, error);
334 kdmsg_msg_result(msg, error);
/*
 * Handle a message inside a BLK_READ transaction.
 *
 * Only a message whose own command is DMSG_BLK_READ starts I/O.  The
 * requested byte count must lie in [DEV_BSIZE, MAXPHYS], otherwise error
 * becomes DMSG_ERR_PARAM.  For a valid request:
 *
 *  - a struct dios_io is allocated from M_DEVBUF and stored in
 *    msg->state->any.any,
 *  - a buffer is taken with getpbuf_mem() and set up as a BUF_CMD_READ of
 *    blk_read.bytes at blk_read.offset,
 *  - the transaction state is stashed in bio_caller_info1.ptr and
 *    diskiodone() is installed as the completion callback,
 *  - blk_active and iost->count are incremented and the bio is handed to
 *    dev_dstrategy() on the raw device.
 *
 * The tail of the function answers directly with error: via
 * kdmsg_msg_reply() when the message carries DMSGF_DELETE (freeing iost
 * first if no I/O is counted on it), otherwise via kdmsg_msg_result().
 *
 * NOTE(review): the control flow separating the asynchronous path from
 * the direct-answer tail is not visible in this listing.
 */
340 disk_blk_read(struct disk *dp, kdmsg_msg_t *msg)
342 struct dios_io *iost;
345 int error = DMSG_ERR_NOSUPP;
349 * Only DMSG_BLK_READ commands imply read ops.
351 iost = msg->state->any.any;
352 if ((msg->any.head.cmd & DMSGF_CMDSWMASK) == DMSG_BLK_READ) {
353 if (msg->any.blk_read.bytes < DEV_BSIZE ||
354 msg->any.blk_read.bytes > MAXPHYS) {
355 error = DMSG_ERR_PARAM;
359 iost = kmalloc(sizeof(*iost), M_DEVBUF,
361 msg->state->any.any = iost;
364 bp = getpbuf_mem(NULL);
365 KKASSERT(msg->any.blk_read.bytes <= bp->b_bufsize);
367 bp->b_cmd = BUF_CMD_READ;
368 bp->b_bcount = msg->any.blk_read.bytes;
369 bp->b_resid = bp->b_bcount;
370 bio->bio_offset = msg->any.blk_read.offset;
371 bio->bio_caller_info1.ptr = msg->state;
372 bio->bio_done = diskiodone;
374 /* kdmsg_state_hold(msg->state); */
375 atomic_add_int(&blk_active, 1);
376 atomic_add_int(&iost->count, 1);
377 if (msg->any.head.cmd & DMSGF_DELETE)
380 dev_dstrategy(dp->d_rawdev, bio);
384 if (msg->any.head.cmd & DMSGF_DELETE) {
385 if (iost && iost->count == 0) {
386 kfree(iost, M_DEVBUF);
387 msg->state->any.any = NULL;
389 kdmsg_msg_reply(msg, error);
391 kdmsg_msg_result(msg, error);
/*
 * Handle a message inside a BLK_WRITE transaction.
 *
 * Only a message whose own command is DMSG_BLK_WRITE starts I/O.  The
 * byte count must lie in [DEV_BSIZE, MAXPHYS], otherwise error becomes
 * DMSG_ERR_PARAM.  For a valid request a struct dios_io is allocated from
 * M_DEVBUF and stored in msg->state->any.any, and a BUF_CMD_WRITE is
 * built:
 *
 *  - if the message's aux data covers the whole write
 *    (aux_size >= blk_write.bytes) the buffer's data pointer is aimed
 *    straight at msg->aux_data and ownership of that data is moved into
 *    iost->data with kdmsg_detach_aux_data();
 *  - otherwise the aux data is copied into the buffer, the remainder up
 *    to blk_write.bytes is zero-filled, and iost->data is cleared.
 *
 * The bio is then completed through diskiodone(), with blk_active and
 * iost->count incremented before dev_dstrategy() is called.  The tail of
 * the function answers directly with error, using kdmsg_msg_reply() for
 * a DMSGF_DELETE message (freeing an idle iost first) and
 * kdmsg_msg_result() otherwise.
 *
 * NOTE(review): the existing comment below says "imply read ops"; this
 * handler issues writes, so that wording looks like a copy-paste slip.
 * NOTE(review): the statements between the two aux_size checks, and the
 * flow separating the asynchronous path from the direct-answer tail, are
 * not visible in this listing.
 */
398 disk_blk_write(struct disk *dp, kdmsg_msg_t *msg)
400 struct dios_io *iost;
403 int error = DMSG_ERR_NOSUPP;
407 * Only DMSG_BLK_WRITE commands imply read ops.
409 iost = msg->state->any.any;
410 if ((msg->any.head.cmd & DMSGF_CMDSWMASK) == DMSG_BLK_WRITE) {
411 if (msg->any.blk_write.bytes < DEV_BSIZE ||
412 msg->any.blk_write.bytes > MAXPHYS) {
413 error = DMSG_ERR_PARAM;
417 iost = kmalloc(sizeof(*iost), M_DEVBUF,
419 msg->state->any.any = iost;
423 * Issue WRITE. Short data implies zeros. Try to optimize
424 * the buffer cache buffer for the case where we can just
425 * use the message's data pointer.
428 if (msg->aux_size >= msg->any.blk_write.bytes)
431 bp = getpbuf_mem(NULL);
432 KKASSERT(msg->any.blk_write.bytes <= bp->b_bufsize);
434 bp->b_cmd = BUF_CMD_WRITE;
435 bp->b_bcount = msg->any.blk_write.bytes;
436 bp->b_resid = bp->b_bcount;
437 if (msg->aux_size >= msg->any.blk_write.bytes) {
438 bp->b_data = msg->aux_data;
439 kdmsg_detach_aux_data(msg, &iost->data);
441 bcopy(msg->aux_data, bp->b_data, msg->aux_size);
442 bzero(bp->b_data + msg->aux_size,
443 msg->any.blk_write.bytes - msg->aux_size);
444 bzero(&iost->data, sizeof(iost->data));
446 bio->bio_offset = msg->any.blk_write.offset;
447 bio->bio_caller_info1.ptr = msg->state;
448 bio->bio_done = diskiodone;
450 /* kdmsg_state_hold(msg->state); */
451 atomic_add_int(&blk_active, 1);
452 atomic_add_int(&iost->count, 1);
453 if (msg->any.head.cmd & DMSGF_DELETE)
456 dev_dstrategy(dp->d_rawdev, bio);
460 if (msg->any.head.cmd & DMSGF_DELETE) {
461 if (iost && iost->count == 0) {
462 kfree(iost, M_DEVBUF);
463 msg->state->any.any = NULL;
465 kdmsg_msg_reply(msg, error);
467 kdmsg_msg_result(msg, error);
/*
 * Handle a message inside a BLK_FLUSH transaction.
 *
 * Only a message whose own command is DMSG_BLK_FLUSH starts I/O.  A
 * struct dios_io is allocated from M_DEVBUF and stored in
 * msg->state->any.any, and a BUF_CMD_FLUSH covering blk_flush.bytes at
 * blk_flush.offset is sent to the raw device with dev_dstrategy().
 * Completion goes through diskiodone(); blk_active and iost->count are
 * incremented before the bio is issued.
 *
 * The tail of the function answers directly with error, using
 * kdmsg_msg_reply() for a DMSGF_DELETE message (freeing an idle iost
 * first) and kdmsg_msg_result() otherwise.
 *
 * NOTE(review): the existing comment below says "imply read ops"; this
 * handler issues flushes, so that wording looks like a copy-paste slip.
 * NOTE(review): no range check on blk_flush.bytes and no buffer
 * acquisition are visible in this listing -- confirm in the full file.
 */
474 disk_blk_flush(struct disk *dp, kdmsg_msg_t *msg)
476 struct dios_io *iost;
479 int error = DMSG_ERR_NOSUPP;
483 * Only DMSG_BLK_FLUSH commands imply read ops.
485 iost = msg->state->any.any;
486 if ((msg->any.head.cmd & DMSGF_CMDSWMASK) == DMSG_BLK_FLUSH) {
488 iost = kmalloc(sizeof(*iost), M_DEVBUF,
490 msg->state->any.any = iost;
495 bp->b_cmd = BUF_CMD_FLUSH;
496 bp->b_bcount = msg->any.blk_flush.bytes;
498 bio->bio_offset = msg->any.blk_flush.offset;
499 bio->bio_caller_info1.ptr = msg->state;
500 bio->bio_done = diskiodone;
502 /* kdmsg_state_hold(msg->state); */
503 atomic_add_int(&blk_active, 1);
504 atomic_add_int(&iost->count, 1);
505 if (msg->any.head.cmd & DMSGF_DELETE)
508 dev_dstrategy(dp->d_rawdev, bio);
511 if (msg->any.head.cmd & DMSGF_DELETE) {
512 if (iost && iost->count == 0) {
513 kfree(iost, M_DEVBUF);
514 msg->state->any.any = NULL;
516 kdmsg_msg_reply(msg, error);
518 kdmsg_msg_result(msg, error);
/*
 * Handle a message inside a BLK_FREEBLKS transaction.
 *
 * Only a message whose own command is DMSG_BLK_FREEBLKS starts I/O.  A
 * struct dios_io is allocated from M_DEVBUF and stored in
 * msg->state->any.any, and a BUF_CMD_FREEBLKS covering
 * blk_freeblks.bytes at blk_freeblks.offset is sent to the raw device
 * with dev_dstrategy().  Completion goes through diskiodone();
 * blk_active and iost->count are incremented before the bio is issued.
 *
 * The tail of the function answers directly with error, using
 * kdmsg_msg_reply() for a DMSGF_DELETE message (freeing an idle iost
 * first) and kdmsg_msg_result() otherwise.
 *
 * NOTE(review): the existing comment below says "imply read ops"; this
 * handler frees blocks, so that wording looks like a copy-paste slip.
 * NOTE(review): no range check on blk_freeblks.bytes and no buffer
 * acquisition are visible in this listing -- confirm in the full file.
 */
525 disk_blk_freeblks(struct disk *dp, kdmsg_msg_t *msg)
527 struct dios_io *iost;
530 int error = DMSG_ERR_NOSUPP;
534 * Only DMSG_BLK_FREEBLKS commands imply read ops.
536 iost = msg->state->any.any;
537 if ((msg->any.head.cmd & DMSGF_CMDSWMASK) == DMSG_BLK_FREEBLKS) {
539 iost = kmalloc(sizeof(*iost), M_DEVBUF,
541 msg->state->any.any = iost;
546 bp->b_cmd = BUF_CMD_FREEBLKS;
547 bp->b_bcount = msg->any.blk_freeblks.bytes;
549 bio->bio_offset = msg->any.blk_freeblks.offset;
550 bio->bio_caller_info1.ptr = msg->state;
551 bio->bio_done = diskiodone;
553 /* kdmsg_state_hold(msg->state); */
554 atomic_add_int(&blk_active, 1);
555 atomic_add_int(&iost->count, 1);
556 if (msg->any.head.cmd & DMSGF_DELETE)
559 dev_dstrategy(dp->d_rawdev, bio);
562 if (msg->any.head.cmd & DMSGF_DELETE) {
563 if (iost && iost->count == 0) {
564 kfree(iost, M_DEVBUF);
565 msg->state->any.any = NULL;
567 kdmsg_msg_reply(msg, error);
569 kdmsg_msg_result(msg, error);
576 diskiodone(struct bio *bio)
578 struct buf *bp = bio->bio_buf;
579 kdmsg_state_t *state = bio->bio_caller_info1.ptr;
581 struct dios_io *iost = state->any.any;
588 cmd = DMSG_LNK_ERROR;
594 cmd = DMSG_LNK_ERROR;
596 bytes = bp->b_bcount;
599 if (bp->b_flags & B_ERROR) {
605 kdmsg_free_aux_data(&iost->data);
608 case BUF_CMD_FREEBLKS:
609 if (bp->b_flags & B_ERROR)
615 panic("diskiodone: Unknown bio cmd = %d\n",
616 bio->bio_buf->b_cmd);
617 error = 0; /* avoid compiler warning */
618 break; /* NOT REACHED */
622 * Convert error to DMSG_ERR_* code.
628 * Convert LNK_ERROR or BLK_ERROR if non-zero resid. READS will
629 * have already converted cmd to BLK_ERROR and set up data to return.
631 if (resid && cmd == DMSG_LNK_ERROR)
632 cmd = DMSG_BLK_ERROR;
633 /* XXX txcmd is delayed so this won't work for streaming */
634 if ((state->txcmd & DMSGF_CREATE) == 0) /* assume serialized */
637 if (atomic_fetchadd_int(&iost->count, -1) == 1)
640 atomic_add_int(&iost->count, -1);
642 atomic_add_int(&blk_active, -1);
646 * Allocate a basic or extended reply. Be careful not to populate
647 * extended header fields unless we allocated an extended reply.
649 rmsg = kdmsg_msg_alloc(state, cmd, NULL, 0);
651 rmsg->aux_data = kmalloc(bytes, state->iocom->mmsg, M_INTWAIT);
652 rmsg->aux_size = bytes;
653 rmsg->flags |= KDMSG_FLAG_AUXALLOC;
654 bcopy(data, rmsg->aux_data, bytes);
656 rmsg->any.blk_error.head.error = error;
657 if ((cmd & DMSGF_BASECMDMASK) == DMSG_BLK_ERROR)
658 rmsg->any.blk_error.resid = resid;
659 bio->bio_caller_info1.ptr = NULL;
660 /* kdmsg_state_drop(state); */
661 kdmsg_msg_write(rmsg);
662 if (bp->b_flags & B_PAGING) {
665 bp->b_flags |= B_INVAL | B_AGE;