2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.48 2007/06/19 19:09:46 dillon Exp $
37 * Copyright (c) 1995 Jason R. Thorpe.
38 * All rights reserved.
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
43 * 1. Redistributions of source code must retain the above copyright
44 * notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 * notice, this list of conditions and the following disclaimer in the
47 * documentation and/or other materials provided with the distribution.
48 * 3. All advertising materials mentioning features or use of this software
49 * must display the following acknowledgement:
50 * This product includes software developed for the NetBSD Project
52 * 4. The name of the author may not be used to endorse or promote products
53 * derived from this software without specific prior written permission.
55 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
56 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
57 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
58 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
59 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
60 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
61 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
62 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
63 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
69 * Copyright (c) 1988 University of Utah.
70 * Copyright (c) 1990, 1993
71 * The Regents of the University of California. All rights reserved.
73 * This code is derived from software contributed to Berkeley by
74 * the Systems Programming Group of the University of Utah Computer
77 * Redistribution and use in source and binary forms, with or without
78 * modification, are permitted provided that the following conditions
80 * 1. Redistributions of source code must retain the above copyright
81 * notice, this list of conditions and the following disclaimer.
82 * 2. Redistributions in binary form must reproduce the above copyright
83 * notice, this list of conditions and the following disclaimer in the
84 * documentation and/or other materials provided with the distribution.
85 * 3. All advertising materials mentioning features or use of this software
86 * must display the following acknowledgement:
87 * This product includes software developed by the University of
88 * California, Berkeley and its contributors.
89 * 4. Neither the name of the University nor the names of its contributors
90 * may be used to endorse or promote products derived from this software
91 * without specific prior written permission.
93 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
94 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
95 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
96 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
97 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
98 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
99 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
101 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
102 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
105 * from: Utah $Hdr: cd.c 1.6 90/11/28$
108 * @(#)cd.c 8.2 (Berkeley) 11/16/93
109 * $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $
110 * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
111 * $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.48 2007/06/19 19:09:46 dillon Exp $
115 * "Concatenated" disk driver.
117 * Original dynamic configuration support by:
118 * Jason R. Thorpe <thorpej@nas.nasa.gov>
119 * Numerical Aerodynamic Simulation Facility
121 * NASA Ames Research Center
122 * Moffett Field, CA 94035
127 #include <sys/param.h>
128 #include <sys/systm.h>
129 #include <sys/kernel.h>
130 #include <sys/module.h>
131 #include <sys/proc.h>
133 #include <sys/malloc.h>
134 #include <sys/nlookup.h>
135 #include <sys/conf.h>
136 #include <sys/stat.h>
137 #include <sys/sysctl.h>
138 #include <sys/disk.h>
139 #include <sys/dtype.h>
140 #include <sys/diskslice.h>
141 #include <sys/devicestat.h>
142 #include <sys/fcntl.h>
143 #include <sys/vnode.h>
144 #include <sys/buf2.h>
145 #include <sys/ccdvar.h>
147 #include <vm/vm_zone.h>
149 #include <vfs/ufs/dinode.h> /* XXX Used only for fs.h */
150 #include <vfs/ufs/fs.h> /* XXX used only to get BBSIZE and SBSIZE */
152 #include <sys/thread2.h>
154 #if defined(CCDDEBUG) && !defined(DEBUG)
159 #define CCDB_FOLLOW 0x01
160 #define CCDB_INIT 0x02
162 #define CCDB_LABEL 0x08
163 #define CCDB_VNODE 0x10
164 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
166 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
170 #define ccdunit(x) dkunit(x)
171 #define ccdpart(x) dkpart(x)
174 This is how mirroring works (only writes are special):
176 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
177 linked together by the cb_mirror field. "cb_pflags &
178 CCDPF_MIRROR_DONE" is set to 0 on both of them.
180 When a component returns to ccdiodone(), it checks if "cb_pflags &
181 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
182 flag and returns. If it is, it means its partner has already
183 returned, so it will go to the regular cleanup.
188 struct buf cb_buf; /* new I/O buf */
189 struct vnode *cb_vp; /* related vnode */
190 struct bio *cb_obio; /* ptr. to original I/O buf */
191 struct ccdbuf *cb_freenext; /* free list link */
192 int cb_unit; /* target unit */
193 int cb_comp; /* target component */
194 int cb_pflags; /* mirror/parity status flag */
195 struct ccdbuf *cb_mirror; /* mirror counterpart */
198 /* bits in cb_pflags */
199 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
201 static d_open_t ccdopen;
202 static d_close_t ccdclose;
203 static d_strategy_t ccdstrategy;
204 static d_ioctl_t ccdioctl;
205 static d_dump_t ccddump;
207 #define NCCDFREEHIWAT 16
209 #define CDEV_MAJOR 74
211 static struct dev_ops ccd_ops = {
212 { "ccd", CDEV_MAJOR, D_DISK },
216 .d_write = physwrite,
218 .d_strategy = ccdstrategy,
222 /* called during module initialization */
223 static void ccdattach (void);
224 static int ccd_modevent (module_t, int, void *);
226 /* called by biodone() at interrupt time */
227 static void ccdiodone (struct bio *bio);
229 static void ccdstart (struct ccd_softc *, struct bio *);
230 static void ccdinterleave (struct ccd_softc *, int);
231 static void ccdintr (struct ccd_softc *, struct bio *);
232 static int ccdinit (struct ccddevice *, char **, struct ucred *);
233 static int ccdlookup (char *, struct vnode **);
234 static void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
235 struct bio *, off_t, caddr_t, long);
236 static int ccdlock (struct ccd_softc *);
237 static void ccdunlock (struct ccd_softc *);
240 static void printiinfo (struct ccdiinfo *);
243 /* Non-private for the benefit of libkvm. */
244 struct ccd_softc *ccd_softc;
245 struct ccddevice *ccddevs;
246 struct ccdbuf *ccdfreebufs;
247 static int numccdfreebufs;
248 static int numccd = 0;
251 * getccdbuf() - Allocate and zero a ccd buffer.
253 * This routine is called at splbio().
263 * Allocate from freelist or malloc as necessary
265 if ((cbp = ccdfreebufs) != NULL) {
266 ccdfreebufs = cbp->cb_freenext;
268 reinitbufbio(&cbp->cb_buf);
270 cbp = kmalloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK|M_ZERO);
271 initbufbio(&cbp->cb_buf);
275 * independent struct buf initialization
277 LIST_INIT(&cbp->cb_buf.b_dep);
278 BUF_LOCKINIT(&cbp->cb_buf);
279 BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
280 BUF_KERNPROC(&cbp->cb_buf);
281 cbp->cb_buf.b_flags = B_PAGING | B_BNOCLIP;
287 * putccdbuf() - Free a ccd buffer.
289 * This routine is called at splbio().
294 putccdbuf(struct ccdbuf *cbp)
296 BUF_UNLOCK(&cbp->cb_buf);
297 BUF_LOCKFREE(&cbp->cb_buf);
299 if (numccdfreebufs < NCCDFREEHIWAT) {
300 cbp->cb_freenext = ccdfreebufs;
304 kfree((caddr_t)cbp, M_DEVBUF);
309 * Called by main() during pseudo-device attachment. All we need
310 * to do is allocate enough space for devices to be configured later, and
316 struct disk_info info;
317 struct ccd_softc *cs;
322 kprintf("ccd0-%d: Concatenated disk drivers\n", num-1);
324 kprintf("ccd0: Concatenated disk driver\n");
326 ccd_softc = kmalloc(num * sizeof(struct ccd_softc), M_DEVBUF,
328 ccddevs = kmalloc(num * sizeof(struct ccddevice), M_DEVBUF,
333 * With normal disk devices the open simply fails if the media
334 * is not present. With CCD we have to be able to open the
335 * raw disk to use the ioctl's to set it up, so create a dummy
336 * disk info structure so dscheck() doesn't blow up.
338 bzero(&info, sizeof(info));
339 info.d_media_blksize = DEV_BSIZE;
341 for (i = 0; i < numccd; ++i) {
343 cs->sc_dev = disk_create(i, &cs->sc_disk, &ccd_ops);
344 cs->sc_dev->si_drv1 = cs;
345 cs->sc_dev->si_iosize_max = 256 * 512; /* XXX */
346 disk_setdiskinfo(&cs->sc_disk, &info);
351 ccd_modevent(module_t mod, int type, void *data)
361 kprintf("ccd0: Unload not supported!\n");
365 default: /* MOD_SHUTDOWN etc */
371 DEV_MODULE(ccd, ccd_modevent, NULL);
374 ccdinit(struct ccddevice *ccd, char **cpaths, struct ucred *cred)
376 struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
377 struct ccdcinfo *ci = NULL; /* XXX */
384 struct partinfo dpart;
385 struct ccdgeom *ccg = &cs->sc_geom;
386 char tmppath[MAXPATHLEN];
390 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
391 kprintf("ccdinit: unit %d\n", ccd->ccd_unit);
395 cs->sc_ileave = ccd->ccd_interleave;
396 cs->sc_nccdisks = ccd->ccd_ndev;
398 /* Allocate space for the component info. */
399 cs->sc_cinfo = kmalloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
403 * Verify that each component piece exists and record
404 * relevant information about it.
408 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
409 vp = ccd->ccd_vpp[ix];
410 ci = &cs->sc_cinfo[ix];
414 * Copy in the pathname of the component.
416 bzero(tmppath, sizeof(tmppath)); /* sanity */
417 if ((error = copyinstr(cpaths[ix], tmppath,
418 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
420 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
421 kprintf("ccd%d: can't copy path, error = %d\n",
422 ccd->ccd_unit, error);
426 ci->ci_path = kmalloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
427 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
429 ci->ci_dev = vn_todev(vp);
432 * Get partition information for the component.
434 error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart, FREAD, cred);
437 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
438 kprintf("ccd%d: %s: ioctl failed, error = %d\n",
439 ccd->ccd_unit, ci->ci_path, error);
443 if (dpart.fstype != FS_CCD &&
444 !kuuid_is_ccd(&dpart.fstype_uuid)) {
445 kprintf("ccd%d: %s: filesystem type must be 'ccd'\n",
446 ccd->ccd_unit, ci->ci_path);
450 if (maxsecsize < dpart.media_blksize)
451 maxsecsize = dpart.media_blksize;
454 * Skip a certain amount of storage at the beginning of
455 * the component to make sure we don't infringe on any
456 * reserved sectors. This is handled entirely by
457 * dpart.reserved_blocks but we also impose a minimum
458 * of 16 sectors for backwards compatibility.
461 if (skip < dpart.reserved_blocks)
462 skip = dpart.reserved_blocks;
463 size = dpart.media_blocks - skip;
466 * Calculate the size, truncating to an interleave
467 * boundary if necessary.
469 if (cs->sc_ileave > 1)
470 size -= size % cs->sc_ileave;
472 if ((int64_t)size <= 0) {
474 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
475 kprintf("ccd%d: %s: size == 0\n",
476 ccd->ccd_unit, ci->ci_path);
483 * Calculate the smallest uniform component, used
486 if (minsize == 0 || minsize > size)
494 * Don't allow the interleave to be smaller than
495 * the biggest component sector.
497 if ((cs->sc_ileave > 0) &&
498 (cs->sc_ileave % (maxsecsize / DEV_BSIZE))) {
500 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
501 kprintf("ccd%d: interleave must be at least %d\n",
502 ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
509 * If uniform interleave is desired set all sizes to that of
510 * the smallest component. This will guarantee that a single
511 * interleave table is generated.
513 * Lost space must be taken into account when calculating the
514 * overall size. Half the space is lost when CCDF_MIRROR is
515 * specified. One disk is lost when CCDF_PARITY is specified.
517 if (ccd->ccd_flags & CCDF_UNIFORM) {
518 for (ci = cs->sc_cinfo;
519 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
520 ci->ci_size = minsize;
522 if (ccd->ccd_flags & CCDF_MIRROR) {
524 * Check to see if an even number of components
525 * have been specified. The interleave must also
526 * be non-zero in order for us to be able to
527 * guarantee the topology.
529 if (cs->sc_nccdisks % 2) {
530 kprintf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
534 if (cs->sc_ileave == 0) {
535 kprintf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
539 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
540 } else if (ccd->ccd_flags & CCDF_PARITY) {
541 cs->sc_size = (cs->sc_nccdisks-1) * minsize;
543 if (cs->sc_ileave == 0) {
544 kprintf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
548 cs->sc_size = cs->sc_nccdisks * minsize;
553 * Construct the interleave table.
555 ccdinterleave(cs, ccd->ccd_unit);
558 * Create pseudo-geometry based on 1MB cylinders. It's
561 ccg->ccg_secsize = maxsecsize;
562 ccg->ccg_ntracks = 1;
563 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
564 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
567 * Add a devstat entry for this device.
569 devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
570 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
571 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
572 DEVSTAT_PRIORITY_ARRAY);
574 cs->sc_flags |= CCDF_INITED;
575 cs->sc_cflags = ccd->ccd_flags; /* So we can find out later... */
576 cs->sc_unit = ccd->ccd_unit;
579 while (ci > cs->sc_cinfo) {
581 kfree(ci->ci_path, M_DEVBUF);
583 kfree(cs->sc_cinfo, M_DEVBUF);
589 ccdinterleave(struct ccd_softc *cs, int unit)
591 struct ccdcinfo *ci, *smallci;
600 if (ccddebug & CCDB_INIT)
601 kprintf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
605 * Allocate an interleave table. The worst case occurs when each
606 * of N disks is of a different size, resulting in N interleave
609 * Chances are this is too big, but we don't care.
611 icount = cs->sc_nccdisks + 1;
612 cs->sc_itable = kmalloc(icount * sizeof(struct ccdiinfo),
613 M_DEVBUF, M_WAITOK|M_ZERO);
616 * Trivial case: no interleave (actually interleave of disk size).
617 * Each table entry represents a single component in its entirety.
619 * An interleave of 0 may not be used with a mirror or parity setup.
621 if (cs->sc_ileave == 0) {
625 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
626 /* Allocate space for ii_index. */
627 ii->ii_index = kmalloc(sizeof(int), M_DEVBUF, M_WAITOK);
629 ii->ii_startblk = bn;
631 ii->ii_index[0] = ix;
632 bn += cs->sc_cinfo[ix].ci_size;
637 if (ccddebug & CCDB_INIT)
638 printiinfo(cs->sc_itable);
644 * The following isn't fast or pretty; it doesn't have to be.
648 for (ii = cs->sc_itable; ii < &cs->sc_itable[icount]; ++ii) {
650 * Allocate space for ii_index. We might allocate more than
653 ii->ii_index = kmalloc((sizeof(int) * cs->sc_nccdisks),
657 * Locate the smallest of the remaining components
661 while (ci < &cs->sc_cinfo[cs->sc_nccdisks]) {
662 if (ci->ci_size > size &&
664 ci->ci_size < smallci->ci_size)) {
671 * Nobody left, all done
673 if (smallci == NULL) {
679 * Record starting logical block using an sc_ileave blocksize.
681 ii->ii_startblk = bn / cs->sc_ileave;
684 * Record starting component block using an sc_ileave
685 * blocksize. This value is relative to the beginning of
688 ii->ii_startoff = lbn;
691 * Determine how many disks take part in this interleave
692 * and record their indices.
695 for (ci = cs->sc_cinfo;
696 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
697 if (ci->ci_size >= smallci->ci_size) {
698 ii->ii_index[ix++] = ci - cs->sc_cinfo;
706 bn += ix * (smallci->ci_size - size);
707 lbn = smallci->ci_size / cs->sc_ileave;
708 size = smallci->ci_size;
710 if (ii == &cs->sc_itable[icount])
711 panic("ccdinterlave software bug! table exhausted");
713 if (ccddebug & CCDB_INIT)
714 printiinfo(cs->sc_itable);
720 ccdopen(struct dev_open_args *ap)
722 cdev_t dev = ap->a_head.a_dev;
723 int unit = ccdunit(dev);
724 struct ccd_softc *cs;
728 if (ccddebug & CCDB_FOLLOW)
729 kprintf("ccdopen(%x, %x)\n", dev, flags);
733 cs = &ccd_softc[unit];
735 if ((error = ccdlock(cs)) == 0) {
743 ccdclose(struct dev_close_args *ap)
745 cdev_t dev = ap->a_head.a_dev;
746 int unit = ccdunit(dev);
747 struct ccd_softc *cs;
751 if (ccddebug & CCDB_FOLLOW)
752 kprintf("ccdclose(%x, %x)\n", dev, flags);
757 cs = &ccd_softc[unit];
758 if ((error = ccdlock(cs)) == 0) {
765 ccdstrategy(struct dev_strategy_args *ap)
767 cdev_t dev = ap->a_head.a_dev;
768 struct bio *bio = ap->a_bio;
769 int unit = ccdunit(dev);
771 struct buf *bp = bio->bio_buf;
772 struct ccd_softc *cs = &ccd_softc[unit];
773 u_int64_t pbn; /* in sc_secsize chunks */
774 u_int32_t sz; /* in sc_secsize chunks */
777 if (ccddebug & CCDB_FOLLOW)
778 kprintf("ccdstrategy(%x): unit %d\n", bp, unit);
780 if ((cs->sc_flags & CCDF_INITED) == 0) {
785 /* If it's a nil transfer, wake up the top half now. */
786 if (bp->b_bcount == 0) {
792 * Do bounds checking and adjust transfer. If there's an
793 * error, the bounds check will flag that for us.
796 pbn = bio->bio_offset / cs->sc_geom.ccg_secsize;
797 sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
800 * If out of bounds return an error. If the request goes
801 * past EOF, clip the request as appropriate. If exactly
802 * at EOF, return success (don't clip), but with 0 bytes
805 * Mark EOF B_INVAL (just like bad), indicating that the
806 * contents of the buffer, if any, is invalid.
808 if ((int64_t)pbn < 0)
810 if (pbn + sz > cs->sc_size) {
811 if (pbn > cs->sc_size || (bp->b_flags & B_BNOCLIP))
813 if (pbn == cs->sc_size) {
814 bp->b_resid = bp->b_bcount;
815 bp->b_flags |= B_INVAL;
818 sz = (long)(cs->sc_size - pbn);
819 bp->b_bcount = sz * cs->sc_geom.ccg_secsize;
823 bp->b_resid = bp->b_bcount;
824 nbio->bio_driver_info = dev;
835 * note: bio, not nbio, is valid at the done label.
838 bp->b_error = EINVAL;
840 bp->b_resid = bp->b_bcount;
841 bp->b_flags |= B_ERROR | B_INVAL;
848 ccdstart(struct ccd_softc *cs, struct bio *bio)
851 struct ccdbuf *cbp[4];
852 struct buf *bp = bio->bio_buf;
853 /* XXX! : 2 reads and 2 writes for RAID 4/5 */
858 if (ccddebug & CCDB_FOLLOW)
859 kprintf("ccdstart(%x, %x)\n", cs, bp);
862 /* Record the transaction start */
863 devstat_start_transaction(&cs->device_stats);
866 * Allocate component buffers and fire off the requests
868 doffset = bio->bio_offset;
871 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
872 ccdbuffer(cbp, cs, bio, doffset, addr, bcount);
873 rcount = cbp[0]->cb_buf.b_bcount;
875 if (cs->sc_cflags & CCDF_MIRROR) {
877 * Mirroring. Writes go to both disks, reads are
878 * taken from whichever disk seems most appropriate.
880 * We attempt to localize reads to the disk whose arm
881 * is nearest the read request. We ignore seeks due
882 * to writes when making this determination and we
883 * also try to avoid hogging.
885 if (cbp[0]->cb_buf.b_cmd != BUF_CMD_READ) {
886 vn_strategy(cbp[0]->cb_vp,
887 &cbp[0]->cb_buf.b_bio1);
888 vn_strategy(cbp[1]->cb_vp,
889 &cbp[1]->cb_buf.b_bio1);
891 int pick = cs->sc_pick;
892 daddr_t range = cs->sc_size / 16 * cs->sc_geom.ccg_secsize;
893 if (doffset < cs->sc_blk[pick] - range ||
894 doffset > cs->sc_blk[pick] + range
896 cs->sc_pick = pick = 1 - pick;
898 cs->sc_blk[pick] = doffset + rcount;
899 vn_strategy(cbp[pick]->cb_vp,
900 &cbp[pick]->cb_buf.b_bio1);
906 vn_strategy(cbp[0]->cb_vp,
907 &cbp[0]->cb_buf.b_bio1);
915 * Build a component buffer header.
918 ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct bio *bio,
919 off_t doffset, caddr_t addr, long bcount)
921 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */
929 if (ccddebug & CCDB_IO)
930 kprintf("ccdbuffer(%x, %x, %d, %x, %d)\n",
931 cs, bp, bn, addr, bcount);
934 * Determine which component bn falls in.
936 bn = doffset / cs->sc_geom.ccg_secsize;
940 if (cs->sc_ileave == 0) {
942 * Serially concatenated and neither a mirror nor a parity
943 * config. This is a special case.
948 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
956 * Calculate cbn, the logical superblock (sc_ileave chunks),
957 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
960 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
961 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
964 * Figure out which interleave table to use.
966 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
967 if (ii->ii_startblk > cbn)
973 * off is the logical superblock relative to the beginning
974 * of this interleave block.
976 off = cbn - ii->ii_startblk;
979 * We must calculate which disk component to use (ccdisk),
980 * and recalculate cbn to be the superblock relative to
981 * the beginning of the component. This is typically done by
982 * adding 'off' and ii->ii_startoff together. However, 'off'
983 * must typically be divided by the number of components in
984 * this interleave array to properly convert it from a
985 * CCD-relative logical superblock number to a
986 * component-relative superblock number.
988 if (ii->ii_ndisk == 1) {
990 * When we have just one disk, it can't be a mirror
991 * or a parity config.
993 ccdisk = ii->ii_index[0];
994 cbn = ii->ii_startoff + off;
996 if (cs->sc_cflags & CCDF_MIRROR) {
998 * We have forced a uniform mapping, resulting
999 * in a single interleave array. We double
1000 * up on the first half of the available
1001 * components and our mirror is in the second
1002 * half. This only works with a single
1003 * interleave array because doubling up
1004 * doubles the number of sectors, so there
1005 * cannot be another interleave array because
1006 * the next interleave array's calculations
1009 int ndisk2 = ii->ii_ndisk / 2;
1010 ccdisk = ii->ii_index[off % ndisk2];
1011 cbn = ii->ii_startoff + off / ndisk2;
1012 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1013 } else if (cs->sc_cflags & CCDF_PARITY) {
1015 * XXX not implemented yet
1017 int ndisk2 = ii->ii_ndisk - 1;
1018 ccdisk = ii->ii_index[off % ndisk2];
1019 cbn = ii->ii_startoff + off / ndisk2;
1020 if (cbn % ii->ii_ndisk <= ccdisk)
1023 ccdisk = ii->ii_index[off % ii->ii_ndisk];
1024 cbn = ii->ii_startoff + off / ii->ii_ndisk;
1028 ci = &cs->sc_cinfo[ccdisk];
1031 * Convert cbn from a superblock to a normal block so it
1032 * can be used to calculate (along with cboff) the normal
1033 * block index into this particular disk.
1035 cbn *= cs->sc_ileave;
1039 * Fill in the component buf structure.
1041 * NOTE: devices do not use b_bufsize, only b_bcount, but b_bcount
1042 * will be truncated on device EOF so we use b_bufsize to detect
1046 cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1047 cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1048 cbp->cb_buf.b_data = addr;
1049 cbp->cb_vp = ci->ci_vp;
1050 if (cs->sc_ileave == 0)
1051 cbc = dbtob((off_t)(ci->ci_size - cbn));
1053 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1054 cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1055 cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1057 cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1058 cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1059 cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci->ci_skip);
1062 * context for ccdiodone
1065 cbp->cb_unit = cs - ccd_softc;
1066 cbp->cb_comp = ci - cs->sc_cinfo;
1069 if (ccddebug & CCDB_IO)
1070 kprintf(" dev %x(u%d): cbp %x off %lld addr %x bcnt %d\n",
1071 ci->ci_dev, ci-cs->sc_cinfo, cbp,
1072 cbp->cb_buf.b_bio1.bio_offset,
1073 cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1078 * Note: both I/O's setup when reading from mirror, but only one
1081 if (cs->sc_cflags & CCDF_MIRROR) {
1082 /* mirror, setup second I/O */
1085 cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1086 cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1087 cbp->cb_buf.b_data = addr;
1088 cbp->cb_vp = ci2->ci_vp;
1089 if (cs->sc_ileave == 0)
1090 cbc = dbtob((off_t)(ci->ci_size - cbn));
1092 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1093 cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1094 cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1096 cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1097 cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1098 cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci2->ci_skip);
1101 * context for ccdiodone
1104 cbp->cb_unit = cs - ccd_softc;
1105 cbp->cb_comp = ci2 - cs->sc_cinfo;
1107 /* link together the ccdbuf's and clear "mirror done" flag */
1108 cb[0]->cb_mirror = cb[1];
1109 cb[1]->cb_mirror = cb[0];
1110 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1111 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1116 ccdintr(struct ccd_softc *cs, struct bio *bio)
1118 struct buf *bp = bio->bio_buf;
1121 if (ccddebug & CCDB_FOLLOW)
1122 kprintf("ccdintr(%x, %x)\n", cs, bp);
1125 * Request is done for better or worse, wakeup the top half.
1127 if (bp->b_flags & B_ERROR)
1128 bp->b_resid = bp->b_bcount;
1129 devstat_end_transaction_buf(&cs->device_stats, bp);
1134 * Called at interrupt time.
1135 * Mark the component as done and if all components are done,
1136 * take a ccd interrupt.
1139 ccdiodone(struct bio *bio)
1141 struct ccdbuf *cbp = bio->bio_caller_info1.ptr;
1142 struct bio *obio = cbp->cb_obio;
1143 struct buf *obp = obio->bio_buf;
1144 int unit = cbp->cb_unit;
1148 * Since we do not have exclusive access to underlying devices,
1149 * we can't keep cache translations around.
1151 clearbiocache(bio->bio_next);
1155 if (ccddebug & CCDB_FOLLOW)
1156 kprintf("ccdiodone(%x)\n", cbp);
1157 if (ccddebug & CCDB_IO) {
1158 kprintf("ccdiodone: bp %x bcount %d resid %d\n",
1159 obp, obp->b_bcount, obp->b_resid);
1160 kprintf(" dev %x(u%d), cbp %x off %lld addr %x bcnt %d\n",
1161 cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1162 cbp->cb_buf.b_loffset, cbp->cb_buf.b_data,
1163 cbp->cb_buf.b_bcount);
1168 * If an error occurred, report it. If this is a mirrored
1169 * configuration and the first of two possible reads, do not
1170 * set the error in the bp yet because the second read may
1173 if (cbp->cb_buf.b_flags & B_ERROR) {
1174 const char *msg = "";
1176 if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1177 (cbp->cb_buf.b_cmd == BUF_CMD_READ) &&
1178 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1180 * We will try our read on the other disk down
1181 * below, also reverse the default pick so if we
1182 * are doing a scan we do not keep hitting the
1185 struct ccd_softc *cs = &ccd_softc[unit];
1187 msg = ", trying other disk";
1188 cs->sc_pick = 1 - cs->sc_pick;
1189 cs->sc_blk[cs->sc_pick] = obio->bio_offset;
1191 obp->b_flags |= B_ERROR;
1192 obp->b_error = cbp->cb_buf.b_error ?
1193 cbp->cb_buf.b_error : EIO;
1195 kprintf("ccd%d: error %d on component %d offset %lld (ccd offset %lld)%s\n",
1196 unit, obp->b_error, cbp->cb_comp,
1197 cbp->cb_buf.b_bio2.bio_offset,
1198 obio->bio_offset, msg);
1202 * Process mirror. If we are writing, I/O has been initiated on both
1203 * buffers and we fall through only after both are finished.
1205 * If we are reading only one I/O is initiated at a time. If an
1206 * error occurs we initiate the second I/O and return, otherwise
1207 * we free the second I/O without initiating it.
1210 if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1211 if (cbp->cb_buf.b_cmd != BUF_CMD_READ) {
1213 * When writing, handshake with the second buffer
1214 * to determine when both are done. If both are not
1215 * done, return here.
1217 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1218 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1225 * When reading, either dispose of the second buffer
1226 * or initiate I/O on the second buffer if an error
1227 * occurred with this one.
1229 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1230 if (cbp->cb_buf.b_flags & B_ERROR) {
1231 cbp->cb_mirror->cb_pflags |=
1234 cbp->cb_mirror->cb_vp,
1235 &cbp->cb_mirror->cb_buf.b_bio1
1241 putccdbuf(cbp->cb_mirror);
1249 * Use our saved b_bufsize to determine if an unexpected EOF occurred.
1251 count = cbp->cb_buf.b_bufsize;
1255 * If all done, "interrupt".
1257 obp->b_resid -= count;
1258 if (obp->b_resid < 0)
1259 panic("ccdiodone: count");
1260 if (obp->b_resid == 0)
1261 ccdintr(&ccd_softc[unit], obio);
/*
 * ccdioctl: ioctl handler for a ccd unit.
 *
 * The unit number is derived from ap->a_head.a_dev via ccdunit(), the
 * request is selected by ap->a_cmd, and ap->a_data is interpreted as a
 * struct ccd_ioctl.  Two operations appear below:
 *
 *  - a configure path that requires the unit NOT to be CCDF_INITED and
 *    the descriptor to have FWRITE, takes ccdlock(), normalizes the
 *    requested flags, copies in and looks up the component paths, calls
 *    ccdinit(), and finally reports geometry through disk_setdiskinfo();
 *
 *  - a teardown path that requires the unit to BE CCDF_INITED and the
 *    descriptor to have FWRITE, takes ccdlock(), closes every component
 *    vnode, frees the per-unit tables and clears the ccddevs[] entry.
 *
 * NOTE(review): the case labels, the error returns and the matching
 * unlock calls are not shown here; the "CCDIOCCLR" debug string suggests
 * the teardown request is CCDIOCCLR -- confirm against the full source.
 */
1266 ccdioctl(struct dev_ioctl_args *ap)
1268 cdev_t dev = ap->a_head.a_dev;
1269 int unit = ccdunit(dev);
1270 int i, j, lookedup = 0, error = 0;
1271 struct ccd_softc *cs;
1272 struct ccd_ioctl *ccio = (struct ccd_ioctl *)ap->a_data;
1273 struct ccddevice ccd;
1274 struct disk_info info;
1280 cs = &ccd_softc[unit];
1282 bzero(&ccd, sizeof(ccd));
1284 switch (ap->a_cmd) {
/* Configure path: unit must be unconfigured, caller must have FWRITE. */
1286 if (cs->sc_flags & CCDF_INITED)
1289 if ((ap->a_fflag & FWRITE) == 0)
1292 if ((error = ccdlock(cs)) != 0)
1295 if (ccio->ccio_ndisks > CCD_MAXNDISKS) {
1300 /* Fill in some important bits. */
1301 ccd.ccd_unit = unit;
1302 ccd.ccd_interleave = ccio->ccio_ileave;
/*
 * Flag normalization, applied in order: a zero interleave drops
 * mirror/parity, mirror wins over parity when both are requested, and
 * mirror/parity forces CCDF_UNIFORM.  Each adjustment is logged.
 */
1303 if (ccd.ccd_interleave == 0 &&
1304 ((ccio->ccio_flags & CCDF_MIRROR) ||
1305 (ccio->ccio_flags & CCDF_PARITY))) {
1306 kprintf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1307 ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1309 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1310 (ccio->ccio_flags & CCDF_PARITY)) {
1311 kprintf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1312 ccio->ccio_flags &= ~CCDF_PARITY;
1314 if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1315 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1316 kprintf("ccd%d: mirror/parity forces uniform flag\n",
1318 ccio->ccio_flags |= CCDF_UNIFORM;
1320 ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1323 * Allocate space for and copy in the array of
1324 * componet pathnames and device numbers.
1326 cpp = kmalloc(ccio->ccio_ndisks * sizeof(char *),
1327 M_DEVBUF, M_WAITOK);
1328 vpp = kmalloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1329 M_DEVBUF, M_WAITOK);
/*
 * Copy the user's array of path pointers into cpp.
 * NOTE(review): the length below uses sizeof(char **) while the
 * allocation above uses sizeof(char *); both are pointer sizes, but
 * sizeof(char *) would match the allocation.
 */
1331 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1332 ccio->ccio_ndisks * sizeof(char **));
1334 kfree(vpp, M_DEVBUF);
1335 kfree(cpp, M_DEVBUF);
1341 if (ccddebug & CCDB_INIT) {
1342 for (i = 0; i < ccio->ccio_ndisks; ++i)
1343 kprintf("ccdioctl: component %d: 0x%x\n",
1348 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1350 if (ccddebug & CCDB_INIT)
1351 kprintf("ccdioctl: lookedup = %d\n", lookedup);
/*
 * Resolve component i to a vnode.  On failure, close the first
 * `lookedup' vnodes and release both arrays.
 */
1353 if ((error = ccdlookup(cpp[i], &vpp[i])) != 0) {
1354 for (j = 0; j < lookedup; ++j)
1355 (void)vn_close(vpp[j], FREAD|FWRITE);
1356 kfree(vpp, M_DEVBUF);
1357 kfree(cpp, M_DEVBUF);
1365 ccd.ccd_ndev = ccio->ccio_ndisks;
1368 * Initialize the ccd. Fills in the softc for us.
1370 if ((error = ccdinit(&ccd, cpp, ap->a_cred)) != 0) {
1371 for (j = 0; j < lookedup; ++j)
1372 (void)vn_close(vpp[j], FREAD|FWRITE);
1373 kfree(vpp, M_DEVBUF);
1374 kfree(cpp, M_DEVBUF);
1380 * The ccd has been successfully initialized, so
1381 * we can place it into the array and read the disklabel.
1383 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1384 ccio->ccio_unit = unit;
1385 ccio->ccio_size = cs->sc_size;
/* Publish the geometry computed in the softc to the disk layer. */
1387 bzero(&info, sizeof(info));
1388 info.d_media_blksize = cs->sc_geom.ccg_secsize;
1389 info.d_media_blocks = cs->sc_size;
1390 info.d_nheads = cs->sc_geom.ccg_ntracks;
1391 info.d_secpertrack = cs->sc_geom.ccg_nsectors;
1392 info.d_ncylinders = cs->sc_geom.ccg_ncylinders;
1393 info.d_secpercyl = info.d_nheads * info.d_secpertrack;
1396 * For cases where a label is directly applied to the ccd,
1397 * without slices, DSO_COMPATMBR forces one sector be
1398 * reserved for backwards compatibility.
1400 info.d_dsflags = DSO_COMPATMBR;
1401 disk_setdiskinfo(&cs->sc_disk, &info);
/*
 * Teardown path: unit must be configured, caller must have FWRITE, and
 * the device's reference count (dev_drefs) is checked before freeing.
 */
1408 if ((cs->sc_flags & CCDF_INITED) == 0)
1411 if ((ap->a_fflag & FWRITE) == 0)
1414 if ((error = ccdlock(cs)) != 0)
1417 if (dev_drefs(cs->sc_dev) > 1) {
1423 * Free ccd_softc information and clear entry.
1426 /* Close the components and free their pathnames. */
1427 for (i = 0; i < cs->sc_nccdisks; ++i) {
1429 * XXX: this close could potentially fail and
1430 * cause Bad Things. Maybe we need to force
1431 * the close to happen?
1434 if (ccddebug & CCDB_VNODE)
1435 vprint("CCDIOCCLR: vnode info",
1436 cs->sc_cinfo[i].ci_vp);
1438 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE);
1439 kfree(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1442 /* Free interleave index. */
/* The table is walked until the first entry whose ii_ndisk is zero. */
1443 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1444 kfree(cs->sc_itable[i].ii_index, M_DEVBUF);
1446 /* Free component info and interleave table. */
1447 kfree(cs->sc_cinfo, M_DEVBUF);
1448 kfree(cs->sc_itable, M_DEVBUF);
1449 cs->sc_cinfo = NULL;
1450 cs->sc_itable = NULL;
1451 cs->sc_flags &= ~CCDF_INITED;
1454 * Free ccddevice information and clear entry.
1456 kfree(ccddevs[unit].ccd_cpp, M_DEVBUF);
1457 kfree(ccddevs[unit].ccd_vpp, M_DEVBUF);
/*
 * `ccd' was zeroed by the bzero() at function entry and no visible line
 * on this path writes to it, so this copy clears ccddevs[unit].
 */
1458 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1461 * And remove the devstat entry.
1463 devstat_remove_entry(&cs->device_stats);
1465 /* This must be atomic. */
/*
 * ccddump: dump entry point for the ccd device, taking the
 * struct dev_dump_args supplied by the caller.  Dumping is not supported;
 * the body contains no dump logic.
 * NOTE(review): the value returned to the caller is not shown here --
 * confirm which errno is reported.
 */
1480 ccddump(struct dev_dump_args *ap)
1482 /* Not implemented. */
1487 * Lookup the provided name in the filesystem. If the file exists,
1488 * is a valid block device, and isn't being used by anyone else,
1489 * set *vpp to the file's vnode.
1492 ccdlookup(char *path, struct vnode **vpp)
1494 struct nlookupdata nd;
1500 error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1503 if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1505 if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1506 kprintf("ccdlookup: vn_open error = %d\n", error);
1512 if (vp->v_opencount > 1) {
1517 if (!vn_isdisk(vp, &error))
1521 if (ccddebug & CCDB_VNODE)
1522 vprint("ccdlookup: vnode info", vp);
1526 nd.nl_open_vp = NULL;
1528 *vpp = vp; /* leave ref intact */
1536 * Wait interruptibly for an exclusive lock.
1539 * Several drivers do this; it should be abstracted and made MP-safe.
1542 ccdlock(struct ccd_softc *cs)
1546 while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1547 cs->sc_flags |= CCDF_WANTED;
1548 if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
1551 cs->sc_flags |= CCDF_LOCKED;
1556 * Unlock and wake up any waiters.
1559 ccdunlock(struct ccd_softc *cs)
1562 cs->sc_flags &= ~CCDF_LOCKED;
1563 if ((cs->sc_flags & CCDF_WANTED) != 0) {
1564 cs->sc_flags &= ~CCDF_WANTED;
1571 printiinfo(struct ccdiinfo *ii)
1575 for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1576 kprintf(" itab[%d]: #dk %d sblk %d soff %d",
1577 ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
1578 for (i = 0; i < ii->ii_ndisk; i++)
1579 kprintf(" %d", ii->ii_index[i]);
1586 /* Local Variables: */
1587 /* c-argdecl-indent: 8 */
1588 /* c-continued-statement-offset: 8 */
1589 /* c-indent-level: 8 */