2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * Copyright (c) 1995 Jason R. Thorpe.
37 * All rights reserved.
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. All advertising materials mentioning features or use of this software
48 * must display the following acknowledgement:
49 * This product includes software developed for the NetBSD Project
51 * 4. The name of the author may not be used to endorse or promote products
52 * derived from this software without specific prior written permission.
54 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
55 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
56 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
57 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
58 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
59 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
60 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
61 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
62 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * Copyright (c) 1988 University of Utah.
69 * Copyright (c) 1990, 1993
70 * The Regents of the University of California. All rights reserved.
72 * This code is derived from software contributed to Berkeley by
73 * the Systems Programming Group of the University of Utah Computer
76 * Redistribution and use in source and binary forms, with or without
77 * modification, are permitted provided that the following conditions
79 * 1. Redistributions of source code must retain the above copyright
80 * notice, this list of conditions and the following disclaimer.
81 * 2. Redistributions in binary form must reproduce the above copyright
82 * notice, this list of conditions and the following disclaimer in the
83 * documentation and/or other materials provided with the distribution.
84 * 3. All advertising materials mentioning features or use of this software
85 * must display the following acknowledgement:
86 * This product includes software developed by the University of
87 * California, Berkeley and its contributors.
88 * 4. Neither the name of the University nor the names of its contributors
89 * may be used to endorse or promote products derived from this software
90 * without specific prior written permission.
92 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
93 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
95 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
96 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
97 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
98 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
99 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
100 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
101 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
104 * from: Utah $Hdr: cd.c 1.6 90/11/28$
107 * @(#)cd.c 8.2 (Berkeley) 11/16/93
108 * $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $
109 * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
113 * "Concatenated" disk driver.
115 * Original dynamic configuration support by:
116 * Jason R. Thorpe <thorpej@nas.nasa.gov>
117 * Numerical Aerodynamic Simulation Facility
119 * NASA Ames Research Center
120 * Moffett Field, CA 94035
125 #include <sys/param.h>
126 #include <sys/systm.h>
127 #include <sys/kernel.h>
128 #include <sys/module.h>
129 #include <sys/proc.h>
131 #include <sys/malloc.h>
132 #include <sys/nlookup.h>
133 #include <sys/conf.h>
134 #include <sys/stat.h>
135 #include <sys/sysctl.h>
136 #include <sys/disk.h>
137 #include <sys/dtype.h>
138 #include <sys/diskslice.h>
139 #include <sys/devicestat.h>
140 #include <sys/fcntl.h>
141 #include <sys/vnode.h>
142 #include <sys/ccdvar.h>
144 #include <vm/vm_zone.h>
146 #include <vfs/ufs/dinode.h> /* XXX Used only for fs.h */
147 #include <vfs/ufs/fs.h> /* XXX used only to get BBSIZE and SBSIZE */
149 #include <sys/thread2.h>
150 #include <sys/buf2.h>
152 #if defined(CCDDEBUG) && !defined(DEBUG)
157 #define CCDB_FOLLOW 0x01
158 #define CCDB_INIT 0x02
160 #define CCDB_LABEL 0x08
161 #define CCDB_VNODE 0x10
162 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
164 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
168 #define ccdunit(x) dkunit(x)
169 #define ccdpart(x) dkpart(x)
172 This is how mirroring works (only writes are special):
174 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
175 linked together by the cb_mirror field. "cb_pflags &
176 CCDPF_MIRROR_DONE" is set to 0 on both of them.
178 When a component returns to ccdiodone(), it checks if "cb_pflags &
179 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
180 flag and returns. If it is, it means its partner has already
181 returned, so it will go to the regular cleanup.
186 struct buf cb_buf; /* new I/O buf */
187 struct vnode *cb_vp; /* related vnode */
188 struct bio *cb_obio; /* ptr. to original I/O buf */
189 int cb_unit; /* target unit */
190 int cb_comp; /* target component */
191 int cb_pflags; /* mirror/parity status flag */
192 struct ccdbuf *cb_mirror; /* mirror counterpart */
195 /* bits in cb_pflags */
196 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
198 static d_open_t ccdopen;
199 static d_close_t ccdclose;
200 static d_strategy_t ccdstrategy;
201 static d_ioctl_t ccdioctl;
202 static d_dump_t ccddump;
204 static struct dev_ops ccd_ops = {
205 { "ccd", 0, D_DISK | D_MPSAFE },
209 .d_write = physwrite,
211 .d_strategy = ccdstrategy,
215 /* called during module initialization */
216 static void ccdattach (void);
217 static int ccddetach (void);
218 static int ccd_modevent (module_t, int, void *);
220 /* called by biodone() at interrupt time */
221 static void ccdiodone (struct bio *bio);
223 static void ccdstart (struct ccd_softc *, struct bio *);
224 static void ccdinterleave (struct ccd_softc *, int);
225 static void ccdintr (struct ccd_softc *, struct bio *);
226 static int ccdinit (struct ccddevice *, char **, struct ucred *);
227 static int ccdlookup (char *, struct vnode **);
228 static void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
229 struct bio *, off_t, caddr_t, long);
230 static int ccdlock (struct ccd_softc *);
231 static void ccdunlock (struct ccd_softc *);
234 static void printiinfo (struct ccdiinfo *);
237 /* Non-private for the benefit of libkvm. */
238 struct ccd_softc *ccd_softc;
239 struct ccddevice *ccddevs;
240 static int numccd = 0;
243 * getccdbuf() - Allocate and zero a ccd buffer.
245 static struct ccdbuf *
250 cbp = kmalloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK | M_ZERO);
251 initbufbio(&cbp->cb_buf);
254 * independent struct buf initialization
256 buf_dep_init(&cbp->cb_buf);
257 BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
258 BUF_KERNPROC(&cbp->cb_buf);
259 cbp->cb_buf.b_flags = B_PAGING | B_BNOCLIP;
265 * putccdbuf() - Free a ccd buffer.
268 putccdbuf(struct ccdbuf *cbp)
270 BUF_UNLOCK(&cbp->cb_buf);
272 uninitbufbio(&cbp->cb_buf);
273 kfree(cbp, M_DEVBUF);
277 * Called by main() during pseudo-device attachment. All we need
278 * to do is allocate enough space for devices to be configured later, and
284 struct disk_info info;
285 struct ccd_softc *cs;
290 kprintf("ccd0-%d: Concatenated disk drivers\n", num-1);
292 kprintf("ccd0: Concatenated disk driver\n");
294 ccd_softc = kmalloc(num * sizeof(struct ccd_softc), M_DEVBUF,
296 ccddevs = kmalloc(num * sizeof(struct ccddevice), M_DEVBUF,
301 * With normal disk devices the open simply fails if the media
302 * is not present. With CCD we have to be able to open the
303 * raw disk to use the ioctl's to set it up, so create a dummy
304 * disk info structure so dscheck() doesn't blow up.
306 bzero(&info, sizeof(info));
307 info.d_media_blksize = DEV_BSIZE;
309 for (i = 0; i < numccd; ++i) {
311 cs->sc_dev = disk_create(i, &cs->sc_disk, &ccd_ops);
312 cs->sc_dev->si_drv1 = cs;
313 cs->sc_dev->si_iosize_max = 256 * 512; /* XXX */
314 disk_setdiskinfo(&cs->sc_disk, &info);
321 struct ccd_softc *cs;
322 struct dev_ioctl_args ioctl_args;
327 bzero(&ioctl_args, sizeof(ioctl_args));
329 for (i = 0; i < numccd; ++i) {
331 if (cs->sc_dev == NULL)
333 ioctl_args.a_head.a_dev = cs->sc_dev;
334 ioctl_args.a_cmd = CCDIOCCLR;
335 ioctl_args.a_fflag = FWRITE;
336 eval = ccdioctl(&ioctl_args);
337 if (eval && eval != ENXIO) {
338 kprintf("ccd%d: In use, cannot detach\n", i);
343 for (i = 0; i < numccd; ++i) {
345 if (cs->sc_dev == NULL)
347 disk_destroy(&cs->sc_disk);
351 kfree(ccd_softc, M_DEVBUF);
353 kfree(ccddevs, M_DEVBUF);
359 ccd_modevent(module_t mod, int type, void *data)
372 default: /* MOD_SHUTDOWN etc */
378 DEV_MODULE(ccd, ccd_modevent, NULL);
381 ccdinit(struct ccddevice *ccd, char **cpaths, struct ucred *cred)
383 struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
384 struct ccdcinfo *ci = NULL; /* XXX */
391 struct partinfo dpart;
392 struct ccdgeom *ccg = &cs->sc_geom;
393 char tmppath[MAXPATHLEN];
397 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
398 kprintf("ccdinit: unit %d\n", ccd->ccd_unit);
402 cs->sc_ileave = ccd->ccd_interleave;
403 cs->sc_nccdisks = ccd->ccd_ndev;
405 /* Allocate space for the component info. */
406 cs->sc_cinfo = kmalloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
408 cs->sc_maxiosize = MAXPHYS;
410 lockinit(&cs->sc_lock, "ccdlck", 0, 0);
414 * Verify that each component piece exists and record
415 * relevant information about it.
419 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
420 vp = ccd->ccd_vpp[ix];
421 ci = &cs->sc_cinfo[ix];
425 * Copy in the pathname of the component.
427 bzero(tmppath, sizeof(tmppath)); /* sanity */
428 if ((error = copyinstr(cpaths[ix], tmppath,
429 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
431 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
432 kprintf("ccd%d: can't copy path, error = %d\n",
433 ccd->ccd_unit, error);
437 ci->ci_path = kmalloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
438 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
440 ci->ci_dev = vn_todev(vp);
441 if (ci->ci_dev->si_iosize_max &&
442 cs->sc_maxiosize > ci->ci_dev->si_iosize_max) {
443 cs->sc_maxiosize = ci->ci_dev->si_iosize_max;
447 * Get partition information for the component.
449 error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart, FREAD,
453 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
454 kprintf("ccd%d: %s: ioctl failed, error = %d\n",
455 ccd->ccd_unit, ci->ci_path, error);
459 if (dpart.fstype != FS_CCD &&
460 !kuuid_is_ccd(&dpart.fstype_uuid)) {
461 kprintf("ccd%d: %s: filesystem type must be 'ccd'\n",
462 ccd->ccd_unit, ci->ci_path);
466 if (maxsecsize < dpart.media_blksize)
467 maxsecsize = dpart.media_blksize;
470 * Skip a certain amount of storage at the beginning of
471 * the component to make sure we don't infringe on any
472 * reserved sectors. This is handled entirely by
473 * dpart.reserved_blocks but we also impose a minimum
474 * of 16 sectors for backwards compatibility.
477 if (skip < dpart.reserved_blocks)
478 skip = dpart.reserved_blocks;
479 size = dpart.media_blocks - skip;
482 * Calculate the size, truncating to an interleave
483 * boundary if necessary.
485 if (cs->sc_ileave > 1)
486 size -= size % cs->sc_ileave;
488 if ((int64_t)size <= 0) {
490 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
491 kprintf("ccd%d: %s: size == 0\n",
492 ccd->ccd_unit, ci->ci_path);
499 * Calculate the smallest uniform component, used
502 if (minsize == 0 || minsize > size)
508 kprintf("ccd%d: max component iosize is %d total blocks %lld\n",
509 cs->sc_unit, cs->sc_maxiosize, (long long)cs->sc_size);
512 * Don't allow the interleave to be smaller than
513 * the biggest component sector.
515 if ((cs->sc_ileave > 0) &&
516 (cs->sc_ileave % (maxsecsize / DEV_BSIZE))) {
518 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
519 kprintf("ccd%d: interleave must be at least %d\n",
520 ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
527 * If uniform interleave is desired set all sizes to that of
528 * the smallest component. This will guarantee that a single
529 * interleave table is generated.
531 * Lost space must be taken into account when calculating the
532 * overall size. Half the space is lost when CCDF_MIRROR is
533 * specified. One disk is lost when CCDF_PARITY is specified.
535 if (ccd->ccd_flags & CCDF_UNIFORM) {
536 for (ci = cs->sc_cinfo;
537 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
538 ci->ci_size = minsize;
540 if (ccd->ccd_flags & CCDF_MIRROR) {
542 * Check to see if an even number of components
543 * have been specified. The interleave must also
544 * be non-zero in order for us to be able to
545 * guarantee the topology.
547 if (cs->sc_nccdisks % 2) {
548 kprintf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
552 if (cs->sc_ileave == 0) {
553 kprintf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
557 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
558 } else if (ccd->ccd_flags & CCDF_PARITY) {
559 cs->sc_size = (cs->sc_nccdisks-1) * minsize;
561 if (cs->sc_ileave == 0) {
562 kprintf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
566 cs->sc_size = cs->sc_nccdisks * minsize;
571 * Construct the interleave table.
573 ccdinterleave(cs, ccd->ccd_unit);
576 * Create pseudo-geometry based on 1MB cylinders. It's
579 ccg->ccg_secsize = maxsecsize;
580 ccg->ccg_ntracks = 1;
581 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
582 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
585 * Add a devstat entry for this device.
587 devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
588 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
589 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
590 DEVSTAT_PRIORITY_ARRAY);
592 cs->sc_flags |= CCDF_INITED;
593 cs->sc_cflags = ccd->ccd_flags; /* So we can find out later... */
594 cs->sc_unit = ccd->ccd_unit;
597 while (ci > cs->sc_cinfo) {
599 kfree(ci->ci_path, M_DEVBUF);
601 kfree(cs->sc_cinfo, M_DEVBUF);
607 ccdinterleave(struct ccd_softc *cs, int unit)
609 struct ccdcinfo *ci, *smallci;
618 if (ccddebug & CCDB_INIT)
619 kprintf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
623 * Allocate an interleave table. The worst case occurs when each
624 * of N disks is of a different size, resulting in N interleave
627 * Chances are this is too big, but we don't care.
629 icount = cs->sc_nccdisks + 1;
630 cs->sc_itable = kmalloc(icount * sizeof(struct ccdiinfo),
631 M_DEVBUF, M_WAITOK|M_ZERO);
634 * Trivial case: no interleave (actually interleave of disk size).
635 * Each table entry represents a single component in its entirety.
637 * An interleave of 0 may not be used with a mirror or parity setup.
639 if (cs->sc_ileave == 0) {
643 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
644 /* Allocate space for ii_index. */
645 ii->ii_index = kmalloc(sizeof(int), M_DEVBUF, M_WAITOK);
647 ii->ii_startblk = bn;
649 ii->ii_index[0] = ix;
650 bn += cs->sc_cinfo[ix].ci_size;
655 if (ccddebug & CCDB_INIT)
656 printiinfo(cs->sc_itable);
662 * The following isn't fast or pretty; it doesn't have to be.
666 for (ii = cs->sc_itable; ii < &cs->sc_itable[icount]; ++ii) {
668 * Allocate space for ii_index. We might allocate more than
671 ii->ii_index = kmalloc((sizeof(int) * cs->sc_nccdisks),
675 * Locate the smallest of the remaining components
679 while (ci < &cs->sc_cinfo[cs->sc_nccdisks]) {
680 if (ci->ci_size > size &&
682 ci->ci_size < smallci->ci_size)) {
689 * Nobody left, all done
691 if (smallci == NULL) {
697 * Record starting logical block using an sc_ileave blocksize.
699 ii->ii_startblk = bn / cs->sc_ileave;
702 * Record starting component block using an sc_ileave
703 * blocksize. This value is relative to the beginning of
706 ii->ii_startoff = lbn;
709 * Determine how many disks take part in this interleave
710 * and record their indices.
713 for (ci = cs->sc_cinfo;
714 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
715 if (ci->ci_size >= smallci->ci_size) {
716 ii->ii_index[ix++] = ci - cs->sc_cinfo;
724 bn += ix * (smallci->ci_size - size);
725 lbn = smallci->ci_size / cs->sc_ileave;
726 size = smallci->ci_size;
728 if (ii == &cs->sc_itable[icount])
729 panic("ccdinterlave software bug! table exhausted");
731 if (ccddebug & CCDB_INIT)
732 printiinfo(cs->sc_itable);
738 ccdopen(struct dev_open_args *ap)
740 cdev_t dev = ap->a_head.a_dev;
741 int unit = ccdunit(dev);
742 struct ccd_softc *cs;
746 if (ccddebug & CCDB_FOLLOW)
747 kprintf("ccdopen(%x, %x)\n", dev, flags);
751 cs = &ccd_softc[unit];
753 if ((error = ccdlock(cs)) == 0) {
761 ccdclose(struct dev_close_args *ap)
763 cdev_t dev = ap->a_head.a_dev;
764 int unit = ccdunit(dev);
765 struct ccd_softc *cs;
769 if (ccddebug & CCDB_FOLLOW)
770 kprintf("ccdclose(%x, %x)\n", dev, flags);
775 cs = &ccd_softc[unit];
776 if ((error = ccdlock(cs)) == 0) {
783 ccdstrategy(struct dev_strategy_args *ap)
785 cdev_t dev = ap->a_head.a_dev;
786 struct bio *bio = ap->a_bio;
787 int unit = ccdunit(dev);
789 struct buf *bp = bio->bio_buf;
790 struct ccd_softc *cs = &ccd_softc[unit];
791 u_int64_t pbn; /* in sc_secsize chunks */
792 u_int32_t sz; /* in sc_secsize chunks */
795 if (ccddebug & CCDB_FOLLOW)
796 kprintf("ccdstrategy(%x): unit %d\n", bp, unit);
798 if ((cs->sc_flags & CCDF_INITED) == 0) {
803 /* If it's a nil transfer, wake up the top half now. */
804 if (bp->b_bcount == 0) {
810 * Do bounds checking and adjust transfer. If there's an
811 * error, the bounds check will flag that for us.
814 pbn = bio->bio_offset / cs->sc_geom.ccg_secsize;
815 sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
818 * If out of bounds return an error. If the request goes
819 * past EOF, clip the request as appropriate. If exactly
820 * at EOF, return success (don't clip), but with 0 bytes
823 * Mark EOF B_INVAL (just like bad), indicating that the
824 * contents of the buffer, if any, are invalid.
826 if ((int64_t)pbn < 0)
828 if (pbn + sz > cs->sc_size) {
829 if (pbn > cs->sc_size || (bp->b_flags & B_BNOCLIP))
831 if (pbn == cs->sc_size) {
832 bp->b_resid = bp->b_bcount;
833 bp->b_flags |= B_INVAL;
836 sz = (long)(cs->sc_size - pbn);
837 bp->b_bcount = sz * cs->sc_geom.ccg_secsize;
841 bp->b_resid = bp->b_bcount;
842 nbio->bio_driver_info = dev;
851 * note: bio, not nbio, is valid at the done label.
854 bp->b_error = EINVAL;
856 bp->b_resid = bp->b_bcount;
857 bp->b_flags |= B_ERROR | B_INVAL;
864 ccdstart(struct ccd_softc *cs, struct bio *bio)
867 struct ccdbuf *cbp[4];
868 struct buf *bp = bio->bio_buf;
869 /* XXX! : 2 reads and 2 writes for RAID 4/5 */
874 if (ccddebug & CCDB_FOLLOW)
875 kprintf("ccdstart(%x, %x)\n", cs, bp);
878 /* Record the transaction start */
879 devstat_start_transaction(&cs->device_stats);
882 * Allocate component buffers and fire off the requests
884 doffset = bio->bio_offset;
887 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
888 ccdbuffer(cbp, cs, bio, doffset, addr, bcount);
889 rcount = cbp[0]->cb_buf.b_bcount;
891 if (cs->sc_cflags & CCDF_MIRROR) {
893 * Mirroring. Writes go to both disks, reads are
894 * taken from whichever disk seems most appropriate.
896 * We attempt to localize reads to the disk whose arm
897 * is nearest the read request. We ignore seeks due
898 * to writes when making this determination and we
899 * also try to avoid hogging.
901 if (cbp[0]->cb_buf.b_cmd != BUF_CMD_READ) {
902 vn_strategy(cbp[0]->cb_vp,
903 &cbp[0]->cb_buf.b_bio1);
904 vn_strategy(cbp[1]->cb_vp,
905 &cbp[1]->cb_buf.b_bio1);
907 int pick = cs->sc_pick;
908 daddr_t range = cs->sc_size / 16 * cs->sc_geom.ccg_secsize;
909 if (doffset < cs->sc_blk[pick] - range ||
910 doffset > cs->sc_blk[pick] + range
912 cs->sc_pick = pick = 1 - pick;
914 cs->sc_blk[pick] = doffset + rcount;
915 vn_strategy(cbp[pick]->cb_vp,
916 &cbp[pick]->cb_buf.b_bio1);
922 vn_strategy(cbp[0]->cb_vp,
923 &cbp[0]->cb_buf.b_bio1);
931 * Build a component buffer header.
934 ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct bio *bio,
935 off_t doffset, caddr_t addr, long bcount)
937 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */
945 if (ccddebug & CCDB_IO)
946 kprintf("ccdbuffer(%x, %x, %d, %x, %d)\n",
947 cs, bp, bn, addr, bcount);
950 * Determine which component bn falls in.
952 bn = doffset / cs->sc_geom.ccg_secsize;
956 if (cs->sc_ileave == 0) {
958 * Serially concatenated and neither a mirror nor a parity
959 * config. This is a special case.
964 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
972 * Calculate cbn, the logical superblock (sc_ileave chunks),
973 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
976 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
977 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
980 * Figure out which interleave table to use.
982 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
983 if (ii->ii_startblk > cbn)
989 * off is the logical superblock relative to the beginning
990 * of this interleave block.
992 off = cbn - ii->ii_startblk;
995 * We must calculate which disk component to use (ccdisk),
996 * and recalculate cbn to be the superblock relative to
997 * the beginning of the component. This is typically done by
998 * adding 'off' and ii->ii_startoff together. However, 'off'
999 * must typically be divided by the number of components in
1000 * this interleave array to properly convert it from a
1001 * CCD-relative logical superblock number to a
1002 * component-relative superblock number.
1004 if (ii->ii_ndisk == 1) {
1006 * When we have just one disk, it can't be a mirror
1007 * or a parity config.
1009 ccdisk = ii->ii_index[0];
1010 cbn = ii->ii_startoff + off;
1012 if (cs->sc_cflags & CCDF_MIRROR) {
1014 * We have forced a uniform mapping, resulting
1015 * in a single interleave array. We double
1016 * up on the first half of the available
1017 * components and our mirror is in the second
1018 * half. This only works with a single
1019 * interleave array because doubling up
1020 * doubles the number of sectors, so there
1021 * cannot be another interleave array because
1022 * the next interleave array's calculations
1025 int ndisk2 = ii->ii_ndisk / 2;
1026 ccdisk = ii->ii_index[off % ndisk2];
1027 cbn = ii->ii_startoff + off / ndisk2;
1028 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1029 } else if (cs->sc_cflags & CCDF_PARITY) {
1031 * XXX not implemented yet
1033 int ndisk2 = ii->ii_ndisk - 1;
1034 ccdisk = ii->ii_index[off % ndisk2];
1035 cbn = ii->ii_startoff + off / ndisk2;
1036 if (cbn % ii->ii_ndisk <= ccdisk)
1039 ccdisk = ii->ii_index[off % ii->ii_ndisk];
1040 cbn = ii->ii_startoff + off / ii->ii_ndisk;
1044 ci = &cs->sc_cinfo[ccdisk];
1047 * Convert cbn from a superblock to a normal block so it
1048 * can be used to calculate (along with cboff) the normal
1049 * block index into this particular disk.
1051 cbn *= cs->sc_ileave;
1055 * Fill in the component buf structure.
1057 * NOTE: devices do not use b_bufsize, only b_bcount, but b_bcount
1058 * will be truncated on device EOF so we use b_bufsize to detect
1062 cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1063 cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1064 cbp->cb_buf.b_data = addr;
1065 cbp->cb_vp = ci->ci_vp;
1066 if (cs->sc_ileave == 0)
1067 cbc = dbtob((off_t)(ci->ci_size - cbn));
1069 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1070 if (cbc > cs->sc_maxiosize)
1071 cbc = cs->sc_maxiosize;
1072 cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1073 cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1075 cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1076 cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1077 cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci->ci_skip);
1080 * context for ccdiodone
1083 cbp->cb_unit = cs - ccd_softc;
1084 cbp->cb_comp = ci - cs->sc_cinfo;
1087 if (ccddebug & CCDB_IO)
1088 kprintf(" dev %x(u%d): cbp %x off %lld addr %x bcnt %d\n",
1089 ci->ci_dev, ci-cs->sc_cinfo, cbp,
1090 cbp->cb_buf.b_bio1.bio_offset,
1091 cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1096 * Note: both I/O's setup when reading from mirror, but only one
1099 if (cs->sc_cflags & CCDF_MIRROR) {
1100 /* mirror, setup second I/O */
1103 cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1104 cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1105 cbp->cb_buf.b_data = addr;
1106 cbp->cb_vp = ci2->ci_vp;
1107 if (cs->sc_ileave == 0)
1108 cbc = dbtob((off_t)(ci->ci_size - cbn));
1110 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1111 if (cbc > cs->sc_maxiosize)
1112 cbc = cs->sc_maxiosize;
1113 cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1114 cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1116 cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1117 cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1118 cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci2->ci_skip);
1121 * context for ccdiodone
1124 cbp->cb_unit = cs - ccd_softc;
1125 cbp->cb_comp = ci2 - cs->sc_cinfo;
1127 /* link together the ccdbuf's and clear "mirror done" flag */
1128 cb[0]->cb_mirror = cb[1];
1129 cb[1]->cb_mirror = cb[0];
1130 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1131 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1136 ccdintr(struct ccd_softc *cs, struct bio *bio)
1138 struct buf *bp = bio->bio_buf;
1141 if (ccddebug & CCDB_FOLLOW)
1142 kprintf("ccdintr(%x, %x)\n", cs, bp);
1145 * Request is done for better or worse, wakeup the top half.
1147 if (bp->b_flags & B_ERROR)
1148 bp->b_resid = bp->b_bcount;
1149 devstat_end_transaction_buf(&cs->device_stats, bp);
1154 * Called at interrupt time.
1156 * Mark the component as done and if all components are done,
1157 * take a ccd interrupt.
1160 ccdiodone(struct bio *bio)
1162 struct ccdbuf *cbp = bio->bio_caller_info1.ptr;
1163 struct bio *obio = cbp->cb_obio;
1164 struct buf *obp = obio->bio_buf;
1165 int unit = cbp->cb_unit;
1166 struct ccd_softc *sc = &ccd_softc[unit];
1170 * Since we do not have exclusive access to underlying devices,
1171 * we can't keep cache translations around.
1173 clearbiocache(bio->bio_next);
1178 if (ccddebug & CCDB_FOLLOW)
1179 kprintf("ccdiodone(%x)\n", cbp);
1180 if (ccddebug & CCDB_IO) {
1181 kprintf("ccdiodone: bp %x bcount %d resid %d\n",
1182 obp, obp->b_bcount, obp->b_resid);
1183 kprintf(" dev %x(u%d), cbp %x off %lld addr %x bcnt %d\n",
1184 cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1185 cbp->cb_buf.b_loffset, cbp->cb_buf.b_data,
1186 cbp->cb_buf.b_bcount);
1191 * If an error occurred, report it. If this is a mirrored
1192 * configuration and the first of two possible reads, do not
1193 * set the error in the bp yet because the second read may
1196 if (cbp->cb_buf.b_flags & B_ERROR) {
1197 const char *msg = "";
1199 if ((sc->sc_cflags & CCDF_MIRROR) &&
1200 (cbp->cb_buf.b_cmd == BUF_CMD_READ) &&
1201 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1203 * We will try our read on the other disk down
1204 * below, also reverse the default pick so if we
1205 * are doing a scan we do not keep hitting the
1208 msg = ", trying other disk";
1209 sc->sc_pick = 1 - sc->sc_pick;
1210 sc->sc_blk[sc->sc_pick] = obio->bio_offset;
1212 obp->b_flags |= B_ERROR;
1213 obp->b_error = cbp->cb_buf.b_error ?
1214 cbp->cb_buf.b_error : EIO;
1216 kprintf("ccd%d: error %d on component %d "
1217 "offset %jd (ccd offset %jd)%s\n",
1218 unit, obp->b_error, cbp->cb_comp,
1219 (intmax_t)cbp->cb_buf.b_bio2.bio_offset,
1220 (intmax_t)obio->bio_offset,
1225 * Process mirror. If we are writing, I/O has been initiated on both
1226 * buffers and we fall through only after both are finished.
1228 * If we are reading only one I/O is initiated at a time. If an
1229 * error occurs we initiate the second I/O and return, otherwise
1230 * we free the second I/O without initiating it.
1233 if (sc->sc_cflags & CCDF_MIRROR) {
1234 if (cbp->cb_buf.b_cmd != BUF_CMD_READ) {
1236 * When writing, handshake with the second buffer
1237 * to determine when both are done. If both are not
1238 * done, return here.
1240 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1241 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1248 * When reading, either dispose of the second buffer
1249 * or initiate I/O on the second buffer if an error
1250 * occurred with this one.
1252 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1253 if (cbp->cb_buf.b_flags & B_ERROR) {
1254 cbp->cb_mirror->cb_pflags |=
1257 cbp->cb_mirror->cb_vp,
1258 &cbp->cb_mirror->cb_buf.b_bio1
1264 putccdbuf(cbp->cb_mirror);
1272 * Use our saved b_bufsize to determine if an unexpected EOF occurred.
1274 count = cbp->cb_buf.b_bufsize;
1278 * If all done, "interrupt".
1280 obp->b_resid -= count;
1281 if (obp->b_resid < 0)
1282 panic("ccdiodone: count");
1286 if (obp->b_resid == 0)
/*
 * ccdioctl: ioctl handler for a ccd unit.
 *
 * ap->a_head.a_dev selects the unit (through ccdunit()), ap->a_data is
 * interpreted as a struct ccd_ioctl, and ap->a_cmd selects the operation.
 * Two operations are handled in the code below: one that configures a
 * unit from a caller-supplied list of component paths, and one that
 * tears a configured unit down (its debug output is tagged "CCDIOCCLR").
 *
 * The local ccddevice "ccd" is zeroed before the switch.  The configure
 * path fills it in and copies it into ccddevs[unit].  The clear path
 * copies it over ccddevs[unit] without assigning any field first, which
 * wipes the entry (NOTE(review): this assumes the two paths are separate
 * switch cases -- confirm against the case labels).
 */
1291 ccdioctl(struct dev_ioctl_args *ap)
1293 cdev_t dev = ap->a_head.a_dev;
1294 int unit = ccdunit(dev);
1295 int i, j, lookedup = 0, error = 0;
1296 struct ccd_softc *cs;
1297 struct ccd_ioctl *ccio = (struct ccd_ioctl *)ap->a_data;
1298 struct ccddevice ccd;
1299 struct disk_info info;
1305 cs = &ccd_softc[unit];
1307 bzero(&ccd, sizeof(ccd));
1309 switch (ap->a_cmd) {
/*
 * Configure path: test that the unit is not already CCDF_INITED and
 * that the descriptor was opened with FWRITE, then take the unit lock.
 */
1311 if (cs->sc_flags & CCDF_INITED)
1314 if ((ap->a_fflag & FWRITE) == 0)
1317 if ((error = ccdlock(cs)) != 0)
/*
 * The component count is compared against CCD_MAXNDISKS before it is
 * used to size the allocations further down.
 */
1320 if (ccio->ccio_ndisks > CCD_MAXNDISKS) {
1325 /* Fill in some important bits. */
1326 ccd.ccd_unit = unit;
1327 ccd.ccd_interleave = ccio->ccio_ileave;
/*
 * Sanitize the requested flags; each adjustment is logged:
 *  - an interleave of 0 drops both CCDF_MIRROR and CCDF_PARITY,
 *  - mirror and parity together keep only mirror,
 *  - mirror or parity forces CCDF_UNIFORM on.
 */
1328 if (ccd.ccd_interleave == 0 &&
1329 ((ccio->ccio_flags & CCDF_MIRROR) ||
1330 (ccio->ccio_flags & CCDF_PARITY))) {
1331 kprintf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1332 ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1334 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1335 (ccio->ccio_flags & CCDF_PARITY)) {
1336 kprintf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1337 ccio->ccio_flags &= ~CCDF_PARITY;
1339 if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1340 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1341 kprintf("ccd%d: mirror/parity forces uniform flag\n",
1343 ccio->ccio_flags |= CCDF_UNIFORM;
/* Only the bits covered by CCDF_USERMASK are kept from the caller. */
1345 ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1348 * Allocate space for and copy in the array of
1349 * componet pathnames and device numbers.
/*
 * cpp receives the caller's array of component path pointers from
 * ccio->ccio_disks; vpp is filled in by ccdlookup() below.
 * NOTE(review): the copyin length is written with sizeof(char **)
 * while cpp is allocated with sizeof(char *).  Both are pointer sizes,
 * so the byte counts agree, but sizeof(char *) would match the
 * allocation.
 */
1351 cpp = kmalloc(ccio->ccio_ndisks * sizeof(char *),
1352 M_DEVBUF, M_WAITOK);
1353 vpp = kmalloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1354 M_DEVBUF, M_WAITOK);
1356 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1357 ccio->ccio_ndisks * sizeof(char **));
1359 kfree(vpp, M_DEVBUF);
1360 kfree(cpp, M_DEVBUF);
1366 if (ccddebug & CCDB_INIT) {
1367 for (i = 0; i < ccio->ccio_ndisks; ++i)
1368 kprintf("ccdioctl: component %d: 0x%x\n",
/*
 * Open each component with ccdlookup().  If one fails, the vnodes at
 * indices below "lookedup" are closed again and both arrays are freed.
 */
1373 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1375 if (ccddebug & CCDB_INIT)
1376 kprintf("ccdioctl: lookedup = %d\n", lookedup);
1378 if ((error = ccdlookup(cpp[i], &vpp[i])) != 0) {
1379 for (j = 0; j < lookedup; ++j)
1380 (void)vn_close(vpp[j], FREAD|FWRITE);
1381 kfree(vpp, M_DEVBUF);
1382 kfree(cpp, M_DEVBUF);
1390 ccd.ccd_ndev = ccio->ccio_ndisks;
1393 * Initialize the ccd. Fills in the softc for us.
/* A ccdinit() failure is cleaned up the same way as a lookup failure. */
1395 if ((error = ccdinit(&ccd, cpp, ap->a_cred)) != 0) {
1396 for (j = 0; j < lookedup; ++j)
1397 vn_close(vpp[j], FREAD|FWRITE);
1398 kfree(vpp, M_DEVBUF);
1399 kfree(cpp, M_DEVBUF);
1405 * The ccd has been successfully initialized, so
1406 * we can place it into the array and read the disklabel.
/* The unit number and total size are reported back through ccio. */
1408 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1409 ccio->ccio_unit = unit;
1410 ccio->ccio_size = cs->sc_size;
/*
 * Build a disk_info from the geometry stored in the softc and hand it
 * to disk_setdiskinfo().
 */
1412 bzero(&info, sizeof(info));
1413 info.d_media_blksize = cs->sc_geom.ccg_secsize;
1414 info.d_media_blocks = cs->sc_size;
1415 info.d_nheads = cs->sc_geom.ccg_ntracks;
1416 info.d_secpertrack = cs->sc_geom.ccg_nsectors;
1417 info.d_ncylinders = cs->sc_geom.ccg_ncylinders;
1418 info.d_secpercyl = info.d_nheads * info.d_secpertrack;
1421 * For cases where a label is directly applied to the ccd,
1422 * without slices, DSO_COMPATMBR forces one sector be
1423 * reserved for backwards compatibility.
1425 info.d_dsflags = DSO_COMPATMBR;
1426 disk_setdiskinfo(&cs->sc_disk, &info);
/*
 * Clear path: test that the unit is CCDF_INITED and that the
 * descriptor was opened with FWRITE, take the unit lock, and then
 * check whether dev_drefs() reports more than one reference to the
 * device.
 */
1433 if ((cs->sc_flags & CCDF_INITED) == 0)
1436 if ((ap->a_fflag & FWRITE) == 0)
1439 if ((error = ccdlock(cs)) != 0)
1442 if (dev_drefs(cs->sc_dev) > 1) {
1448 * Free ccd_softc information and clear entry.
1451 /* Close the components and free their pathnames. */
1452 for (i = 0; i < cs->sc_nccdisks; ++i) {
1454 * XXX: this close could potentially fail and
1455 * cause Bad Things. Maybe we need to force
1456 * the close to happen?
1459 if (ccddebug & CCDB_VNODE)
1460 vprint("CCDIOCCLR: vnode info",
1461 cs->sc_cinfo[i].ci_vp);
1463 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE);
1464 kfree(cs->sc_cinfo[i].ci_path, M_DEVBUF);
/* The walk stops at the first table entry whose ii_ndisk is 0. */
1467 /* Free interleave index. */
1468 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1469 kfree(cs->sc_itable[i].ii_index, M_DEVBUF);
1471 /* Free component info and interleave table. */
1472 kfree(cs->sc_cinfo, M_DEVBUF);
1473 kfree(cs->sc_itable, M_DEVBUF);
1474 cs->sc_cinfo = NULL;
1475 cs->sc_itable = NULL;
1476 cs->sc_flags &= ~CCDF_INITED;
1479 * Free ccddevice information and clear entry.
/*
 * The pointer arrays stored in the table entry are freed first, then
 * the local ccd (zeroed at the top of the function) is copied over
 * the entry.
 */
1481 kfree(ccddevs[unit].ccd_cpp, M_DEVBUF);
1482 kfree(ccddevs[unit].ccd_vpp, M_DEVBUF);
1483 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1486 * And remove the devstat entry.
1488 devstat_remove_entry(&cs->device_stats);
/*
 * ccddump: dump entry point for the ccd device.  As the note in the
 * body says, dumping is not implemented.
 */
1502 ccddump(struct dev_dump_args *ap)
1504 /* Not implemented. */
1509 * Lookup the provided name in the filesystem. If the file exists,
1510 * is a valid block device, and isn't being used by anyone else,
1511 * set *vpp to the file's vnode.
/*
 * ccdlookup: open the component named by path and return its vnode
 * through *vpp.
 *
 * path is handed to nlookup_init() as UIO_USERSPACE, so it is a
 * user-space pointer.  The lookup follows symlinks and locks the vnode
 * (NLC_FOLLOW|NLC_LOCKVP), and the vnode is opened FREAD|FWRITE.
 * ccdioctl() treats a non-zero return value as an error.
 */
1514 ccdlookup(char *path, struct vnode **vpp)
1516 struct nlookupdata nd;
1522 error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1525 if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1527 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1528 kprintf("ccdlookup: vn_open error = %d\n", error);
/* An open count above one means the vnode has another user. */
1534 if (vp->v_opencount > 1) {
/* vn_isdisk() stores its reason in error when the vnode is not a disk. */
1539 if (!vn_isdisk(vp, &error))
1543 if (ccddebug & CCDB_VNODE)
1544 vprint("ccdlookup: vnode info", vp);
/*
 * The vnode is detached from the nlookupdata before being handed to
 * the caller.  NOTE(review): presumably this keeps the nlookup cleanup
 * from closing it -- confirm against nlookup_done().
 */
1548 nd.nl_open_vp = NULL;
1550 *vpp = vp; /* leave ref intact */
1558 * Wait for an exclusive lock (not interruptible: no LK_PCATCH is passed).
/*
 * ccdlock: take cs->sc_lock in exclusive mode through lockmgr().
 * ccdioctl() treats a non-zero return value from this function as an
 * error.
 */
1561 ccdlock(struct ccd_softc *cs)
1563 lockmgr(&cs->sc_lock, LK_EXCLUSIVE);
1569 * Unlock and wake up any waiters.
/*
 * ccdunlock: release cs->sc_lock through lockmgr(LK_RELEASE); the
 * counterpart of ccdlock().
 */
1572 ccdunlock(struct ccd_softc *cs)
1574 lockmgr(&cs->sc_lock, LK_RELEASE);
1579 printiinfo(struct ccdiinfo *ii)
1583 for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1584 kprintf(" itab[%d]: #dk %d sblk %d soff %d",
1585 ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
1586 for (i = 0; i < ii->ii_ndisk; i++)
1587 kprintf(" %d", ii->ii_index[i]);