2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * Copyright (c) 1995 Jason R. Thorpe.
37 * All rights reserved.
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. All advertising materials mentioning features or use of this software
48 * must display the following acknowledgement:
49 * This product includes software developed for the NetBSD Project
51 * 4. The name of the author may not be used to endorse or promote products
52 * derived from this software without specific prior written permission.
54 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
55 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
56 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
57 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
58 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
59 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
60 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
61 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
62 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * Copyright (c) 1988 University of Utah.
69 * Copyright (c) 1990, 1993
70 * The Regents of the University of California. All rights reserved.
72 * This code is derived from software contributed to Berkeley by
73 * the Systems Programming Group of the University of Utah Computer
76 * Redistribution and use in source and binary forms, with or without
77 * modification, are permitted provided that the following conditions
79 * 1. Redistributions of source code must retain the above copyright
80 * notice, this list of conditions and the following disclaimer.
81 * 2. Redistributions in binary form must reproduce the above copyright
82 * notice, this list of conditions and the following disclaimer in the
83 * documentation and/or other materials provided with the distribution.
84 * 3. All advertising materials mentioning features or use of this software
85 * must display the following acknowledgement:
86 * This product includes software developed by the University of
87 * California, Berkeley and its contributors.
88 * 4. Neither the name of the University nor the names of its contributors
89 * may be used to endorse or promote products derived from this software
90 * without specific prior written permission.
92 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
93 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
95 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
96 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
97 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
98 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
99 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
100 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
101 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
104 * from: Utah $Hdr: cd.c 1.6 90/11/28$
107 * @(#)cd.c 8.2 (Berkeley) 11/16/93
108 * $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $
109 * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
113 * "Concatenated" disk driver.
115 * Original dynamic configuration support by:
116 * Jason R. Thorpe <thorpej@nas.nasa.gov>
117 * Numerical Aerodynamic Simulation Facility
119 * NASA Ames Research Center
120 * Moffett Field, CA 94035
125 #include <sys/param.h>
126 #include <sys/systm.h>
127 #include <sys/kernel.h>
128 #include <sys/module.h>
129 #include <sys/proc.h>
131 #include <sys/malloc.h>
132 #include <sys/nlookup.h>
133 #include <sys/conf.h>
134 #include <sys/stat.h>
135 #include <sys/sysctl.h>
136 #include <sys/disk.h>
137 #include <sys/dtype.h>
138 #include <sys/diskslice.h>
139 #include <sys/devicestat.h>
140 #include <sys/fcntl.h>
141 #include <sys/vnode.h>
142 #include <sys/ccdvar.h>
144 #include <vm/vm_zone.h>
146 #include <vfs/ufs/dinode.h> /* XXX Used only for fs.h */
147 #include <vfs/ufs/fs.h> /* XXX used only to get BBSIZE and SBSIZE */
149 #include <sys/thread2.h>
150 #include <sys/buf2.h>
151 #include <sys/mplock2.h>
153 #if defined(CCDDEBUG) && !defined(DEBUG)
158 #define CCDB_FOLLOW 0x01
159 #define CCDB_INIT 0x02
161 #define CCDB_LABEL 0x08
162 #define CCDB_VNODE 0x10
163 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
165 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
/*
 * Device-number decoding helpers, thin aliases for dkunit()/dkpart().
 * The value returned by ccdunit() is used as the index into ccd_softc[].
 * NOTE(review): ccdpart() presumably yields the partition number; confirm
 * against the dkpart() definition.
 */
169 #define ccdunit(x) dkunit(x)
170 #define ccdpart(x) dkpart(x)
173 This is how mirroring works (only writes are special):
175 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
176 linked together by the cb_mirror field. "cb_pflags &
177 CCDPF_MIRROR_DONE" is set to 0 on both of them.
179 When a component returns to ccdiodone(), it checks if "cb_pflags &
180 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
181 flag and returns. If it is, it means its partner has already
182 returned, so it will go to the regular cleanup.
187 struct buf cb_buf; /* new I/O buf */
188 struct vnode *cb_vp; /* related vnode */
189 struct bio *cb_obio; /* ptr. to original I/O buf */
190 struct ccdbuf *cb_freenext; /* free list link */
191 int cb_unit; /* target unit */
192 int cb_comp; /* target component */
193 int cb_pflags; /* mirror/parity status flag */
194 struct ccdbuf *cb_mirror; /* mirror counterpart */
197 /* bits in cb_pflags */
198 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
/*
 * Forward declarations of the driver's device entry points, typed with the
 * d_*_t function typedefs so they can be referenced from ccd_ops.
 */
200 static d_open_t ccdopen;
201 static d_close_t ccdclose;
202 static d_strategy_t ccdstrategy;
203 static d_ioctl_t ccdioctl;
204 static d_dump_t ccddump;
/*
 * High-water mark for the ccdbuf free list: putccdbuf() only links a
 * released buffer back onto ccdfreebufs while numccdfreebufs is below
 * this value.
 */
206 #define NCCDFREEHIWAT 16
208 static struct dev_ops ccd_ops = {
209 { "ccd", 0, D_DISK },
213 .d_write = physwrite,
215 .d_strategy = ccdstrategy,
219 /* called during module initialization */
220 static void ccdattach (void);
221 static int ccddetach (void);
222 static int ccd_modevent (module_t, int, void *);
224 /* called by biodone() at interrupt time */
225 static void ccdiodone (struct bio *bio);
227 static void ccdstart (struct ccd_softc *, struct bio *);
228 static void ccdinterleave (struct ccd_softc *, int);
229 static void ccdintr (struct ccd_softc *, struct bio *);
230 static int ccdinit (struct ccddevice *, char **, struct ucred *);
231 static int ccdlookup (char *, struct vnode **);
232 static void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
233 struct bio *, off_t, caddr_t, long);
234 static int ccdlock (struct ccd_softc *);
235 static void ccdunlock (struct ccd_softc *);
238 static void printiinfo (struct ccdiinfo *);
241 /* Non-private for the benefit of libkvm. */
/* kmalloc'd array of per-unit soft state, indexed by ccd unit number. */
242 struct ccd_softc *ccd_softc;
/* kmalloc'd array of struct ccddevice, sized like ccd_softc. */
243 struct ccddevice *ccddevs;
/* Head of the ccdbuf free list; entries are chained through cb_freenext. */
244 struct ccdbuf *ccdfreebufs;
/* Counter compared against NCCDFREEHIWAT when a ccdbuf is released. */
245 static int numccdfreebufs;
/* Upper bound of the per-unit loops in the attach/detach code. */
246 static int numccd = 0;
249 * getccdbuf() - Allocate and zero a ccd buffer.
251 * This routine is called at splbio().
261 * Allocate from freelist or malloc as necessary
263 if ((cbp = ccdfreebufs) != NULL) {
264 ccdfreebufs = cbp->cb_freenext;
266 reinitbufbio(&cbp->cb_buf);
268 cbp = kmalloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK|M_ZERO);
269 initbufbio(&cbp->cb_buf);
273 * independent struct buf initialization
275 buf_dep_init(&cbp->cb_buf);
276 BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
277 BUF_KERNPROC(&cbp->cb_buf);
278 cbp->cb_buf.b_flags = B_PAGING | B_BNOCLIP;
284 * putccdbuf() - Free a ccd buffer.
286 * This routine is called at splbio().
291 putccdbuf(struct ccdbuf *cbp)
293 BUF_UNLOCK(&cbp->cb_buf);
295 if (numccdfreebufs < NCCDFREEHIWAT) {
296 cbp->cb_freenext = ccdfreebufs;
300 uninitbufbio(&cbp->cb_buf);
301 kfree((caddr_t)cbp, M_DEVBUF);
306 * Called by main() during pseudo-device attachment. All we need
307 * to do is allocate enough space for devices to be configured later, and
313 struct disk_info info;
314 struct ccd_softc *cs;
319 kprintf("ccd0-%d: Concatenated disk drivers\n", num-1);
321 kprintf("ccd0: Concatenated disk driver\n");
323 ccd_softc = kmalloc(num * sizeof(struct ccd_softc), M_DEVBUF,
325 ccddevs = kmalloc(num * sizeof(struct ccddevice), M_DEVBUF,
330 * With normal disk devices the open simply fails if the media
331 * is not present. With CCD we have to be able to open the
332 * raw disk to use the ioctl's to set it up, so create a dummy
333 * disk info structure so dscheck() doesn't blow up.
335 bzero(&info, sizeof(info));
336 info.d_media_blksize = DEV_BSIZE;
338 for (i = 0; i < numccd; ++i) {
340 cs->sc_dev = disk_create(i, &cs->sc_disk, &ccd_ops);
341 cs->sc_dev->si_drv1 = cs;
342 cs->sc_dev->si_iosize_max = 256 * 512; /* XXX */
343 disk_setdiskinfo(&cs->sc_disk, &info);
350 struct ccd_softc *cs;
351 struct dev_ioctl_args ioctl_args;
356 bzero(&ioctl_args, sizeof(ioctl_args));
358 for (i = 0; i < numccd; ++i) {
360 if (cs->sc_dev == NULL)
362 ioctl_args.a_head.a_dev = cs->sc_dev;
363 ioctl_args.a_cmd = CCDIOCCLR;
364 ioctl_args.a_fflag = FWRITE;
365 eval = ccdioctl(&ioctl_args);
366 if (eval && eval != ENXIO) {
367 kprintf("ccd%d: In use, cannot detach\n", i);
372 for (i = 0; i < numccd; ++i) {
374 if (cs->sc_dev == NULL)
376 disk_destroy(&cs->sc_disk);
380 kfree(ccd_softc, M_DEVBUF);
382 kfree(ccddevs, M_DEVBUF);
388 ccd_modevent(module_t mod, int type, void *data)
401 default: /* MOD_SHUTDOWN etc */
407 DEV_MODULE(ccd, ccd_modevent, NULL);
410 ccdinit(struct ccddevice *ccd, char **cpaths, struct ucred *cred)
412 struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
413 struct ccdcinfo *ci = NULL; /* XXX */
420 struct partinfo dpart;
421 struct ccdgeom *ccg = &cs->sc_geom;
422 char tmppath[MAXPATHLEN];
426 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
427 kprintf("ccdinit: unit %d\n", ccd->ccd_unit);
431 cs->sc_ileave = ccd->ccd_interleave;
432 cs->sc_nccdisks = ccd->ccd_ndev;
434 /* Allocate space for the component info. */
435 cs->sc_cinfo = kmalloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
437 cs->sc_maxiosize = MAXPHYS;
440 * Verify that each component piece exists and record
441 * relevant information about it.
445 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
446 vp = ccd->ccd_vpp[ix];
447 ci = &cs->sc_cinfo[ix];
451 * Copy in the pathname of the component.
453 bzero(tmppath, sizeof(tmppath)); /* sanity */
454 if ((error = copyinstr(cpaths[ix], tmppath,
455 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
457 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
458 kprintf("ccd%d: can't copy path, error = %d\n",
459 ccd->ccd_unit, error);
463 ci->ci_path = kmalloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
464 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
466 ci->ci_dev = vn_todev(vp);
467 if (ci->ci_dev->si_iosize_max &&
468 cs->sc_maxiosize > ci->ci_dev->si_iosize_max) {
469 cs->sc_maxiosize = ci->ci_dev->si_iosize_max;
473 * Get partition information for the component.
475 error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart, FREAD,
479 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
480 kprintf("ccd%d: %s: ioctl failed, error = %d\n",
481 ccd->ccd_unit, ci->ci_path, error);
485 if (dpart.fstype != FS_CCD &&
486 !kuuid_is_ccd(&dpart.fstype_uuid)) {
487 kprintf("ccd%d: %s: filesystem type must be 'ccd'\n",
488 ccd->ccd_unit, ci->ci_path);
492 if (maxsecsize < dpart.media_blksize)
493 maxsecsize = dpart.media_blksize;
496 * Skip a certain amount of storage at the beginning of
497 * the component to make sure we don't infringe on any
498 * reserved sectors. This is handled entirely by
499 * dpart.reserved_blocks but we also impose a minimum
500 * of 16 sectors for backwards compatibility.
503 if (skip < dpart.reserved_blocks)
504 skip = dpart.reserved_blocks;
505 size = dpart.media_blocks - skip;
508 * Calculate the size, truncating to an interleave
509 * boundary if necessary.
511 if (cs->sc_ileave > 1)
512 size -= size % cs->sc_ileave;
514 if ((int64_t)size <= 0) {
516 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
517 kprintf("ccd%d: %s: size == 0\n",
518 ccd->ccd_unit, ci->ci_path);
525 * Calculate the smallest uniform component, used
528 if (minsize == 0 || minsize > size)
534 kprintf("ccd%d: max component iosize is %d total blocks %lld\n",
535 cs->sc_unit, cs->sc_maxiosize, (long long)cs->sc_size);
538 * Don't allow the interleave to be smaller than
539 * the biggest component sector.
541 if ((cs->sc_ileave > 0) &&
542 (cs->sc_ileave % (maxsecsize / DEV_BSIZE))) {
544 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
545 kprintf("ccd%d: interleave must be at least %d\n",
546 ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
553 * If uniform interleave is desired set all sizes to that of
554 * the smallest component. This will guarantee that a single
555 * interleave table is generated.
557 * Lost space must be taken into account when calculating the
558 * overall size. Half the space is lost when CCDF_MIRROR is
559 * specified. One disk is lost when CCDF_PARITY is specified.
561 if (ccd->ccd_flags & CCDF_UNIFORM) {
562 for (ci = cs->sc_cinfo;
563 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
564 ci->ci_size = minsize;
566 if (ccd->ccd_flags & CCDF_MIRROR) {
568 * Check to see if an even number of components
569 * have been specified. The interleave must also
570 * be non-zero in order for us to be able to
571 * guarantee the topology.
573 if (cs->sc_nccdisks % 2) {
574 kprintf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
578 if (cs->sc_ileave == 0) {
579 kprintf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
583 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
584 } else if (ccd->ccd_flags & CCDF_PARITY) {
585 cs->sc_size = (cs->sc_nccdisks-1) * minsize;
587 if (cs->sc_ileave == 0) {
588 kprintf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
592 cs->sc_size = cs->sc_nccdisks * minsize;
597 * Construct the interleave table.
599 ccdinterleave(cs, ccd->ccd_unit);
602 * Create pseudo-geometry based on 1MB cylinders. It's
605 ccg->ccg_secsize = maxsecsize;
606 ccg->ccg_ntracks = 1;
607 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
608 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
611 * Add a devstat entry for this device.
613 devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
614 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
615 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
616 DEVSTAT_PRIORITY_ARRAY);
618 cs->sc_flags |= CCDF_INITED;
619 cs->sc_cflags = ccd->ccd_flags; /* So we can find out later... */
620 cs->sc_unit = ccd->ccd_unit;
623 while (ci > cs->sc_cinfo) {
625 kfree(ci->ci_path, M_DEVBUF);
627 kfree(cs->sc_cinfo, M_DEVBUF);
633 ccdinterleave(struct ccd_softc *cs, int unit)
635 struct ccdcinfo *ci, *smallci;
644 if (ccddebug & CCDB_INIT)
645 kprintf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
649 * Allocate an interleave table. The worst case occurs when each
650 * of N disks is of a different size, resulting in N interleave
653 * Chances are this is too big, but we don't care.
655 icount = cs->sc_nccdisks + 1;
656 cs->sc_itable = kmalloc(icount * sizeof(struct ccdiinfo),
657 M_DEVBUF, M_WAITOK|M_ZERO);
660 * Trivial case: no interleave (actually interleave of disk size).
661 * Each table entry represents a single component in its entirety.
663 * An interleave of 0 may not be used with a mirror or parity setup.
665 if (cs->sc_ileave == 0) {
669 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
670 /* Allocate space for ii_index. */
671 ii->ii_index = kmalloc(sizeof(int), M_DEVBUF, M_WAITOK);
673 ii->ii_startblk = bn;
675 ii->ii_index[0] = ix;
676 bn += cs->sc_cinfo[ix].ci_size;
681 if (ccddebug & CCDB_INIT)
682 printiinfo(cs->sc_itable);
688 * The following isn't fast or pretty; it doesn't have to be.
692 for (ii = cs->sc_itable; ii < &cs->sc_itable[icount]; ++ii) {
694 * Allocate space for ii_index. We might allocate more than
697 ii->ii_index = kmalloc((sizeof(int) * cs->sc_nccdisks),
701 * Locate the smallest of the remaining components
705 while (ci < &cs->sc_cinfo[cs->sc_nccdisks]) {
706 if (ci->ci_size > size &&
708 ci->ci_size < smallci->ci_size)) {
715 * Nobody left, all done
717 if (smallci == NULL) {
723 * Record starting logical block using an sc_ileave blocksize.
725 ii->ii_startblk = bn / cs->sc_ileave;
728 * Record starting component block using an sc_ileave
729 * blocksize. This value is relative to the beginning of
732 ii->ii_startoff = lbn;
735 * Determine how many disks take part in this interleave
736 * and record their indices.
739 for (ci = cs->sc_cinfo;
740 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
741 if (ci->ci_size >= smallci->ci_size) {
742 ii->ii_index[ix++] = ci - cs->sc_cinfo;
750 bn += ix * (smallci->ci_size - size);
751 lbn = smallci->ci_size / cs->sc_ileave;
752 size = smallci->ci_size;
754 if (ii == &cs->sc_itable[icount])
755 panic("ccdinterlave software bug! table exhausted");
757 if (ccddebug & CCDB_INIT)
758 printiinfo(cs->sc_itable);
764 ccdopen(struct dev_open_args *ap)
766 cdev_t dev = ap->a_head.a_dev;
767 int unit = ccdunit(dev);
768 struct ccd_softc *cs;
772 if (ccddebug & CCDB_FOLLOW)
773 kprintf("ccdopen(%x, %x)\n", dev, flags);
777 cs = &ccd_softc[unit];
779 if ((error = ccdlock(cs)) == 0) {
787 ccdclose(struct dev_close_args *ap)
789 cdev_t dev = ap->a_head.a_dev;
790 int unit = ccdunit(dev);
791 struct ccd_softc *cs;
795 if (ccddebug & CCDB_FOLLOW)
796 kprintf("ccdclose(%x, %x)\n", dev, flags);
801 cs = &ccd_softc[unit];
802 if ((error = ccdlock(cs)) == 0) {
809 ccdstrategy(struct dev_strategy_args *ap)
811 cdev_t dev = ap->a_head.a_dev;
812 struct bio *bio = ap->a_bio;
813 int unit = ccdunit(dev);
815 struct buf *bp = bio->bio_buf;
816 struct ccd_softc *cs = &ccd_softc[unit];
817 u_int64_t pbn; /* in sc_secsize chunks */
818 u_int32_t sz; /* in sc_secsize chunks */
821 if (ccddebug & CCDB_FOLLOW)
822 kprintf("ccdstrategy(%x): unit %d\n", bp, unit);
824 if ((cs->sc_flags & CCDF_INITED) == 0) {
829 /* If it's a nil transfer, wake up the top half now. */
830 if (bp->b_bcount == 0) {
836 * Do bounds checking and adjust transfer. If there's an
837 * error, the bounds check will flag that for us.
840 pbn = bio->bio_offset / cs->sc_geom.ccg_secsize;
841 sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
844 * If out of bounds return an error. If the request goes
845 * past EOF, clip the request as appropriate. If exactly
846 * at EOF, return success (don't clip), but with 0 bytes
849 * Mark EOF B_INVAL (just like bad), indicating that the
850 * contents of the buffer, if any, is invalid.
852 if ((int64_t)pbn < 0)
854 if (pbn + sz > cs->sc_size) {
855 if (pbn > cs->sc_size || (bp->b_flags & B_BNOCLIP))
857 if (pbn == cs->sc_size) {
858 bp->b_resid = bp->b_bcount;
859 bp->b_flags |= B_INVAL;
862 sz = (long)(cs->sc_size - pbn);
863 bp->b_bcount = sz * cs->sc_geom.ccg_secsize;
867 bp->b_resid = bp->b_bcount;
868 nbio->bio_driver_info = dev;
879 * note: bio, not nbio, is valid at the done label.
882 bp->b_error = EINVAL;
884 bp->b_resid = bp->b_bcount;
885 bp->b_flags |= B_ERROR | B_INVAL;
892 ccdstart(struct ccd_softc *cs, struct bio *bio)
895 struct ccdbuf *cbp[4];
896 struct buf *bp = bio->bio_buf;
897 /* XXX! : 2 reads and 2 writes for RAID 4/5 */
902 if (ccddebug & CCDB_FOLLOW)
903 kprintf("ccdstart(%x, %x)\n", cs, bp);
906 /* Record the transaction start */
907 devstat_start_transaction(&cs->device_stats);
910 * Allocate component buffers and fire off the requests
912 doffset = bio->bio_offset;
915 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
916 ccdbuffer(cbp, cs, bio, doffset, addr, bcount);
917 rcount = cbp[0]->cb_buf.b_bcount;
919 if (cs->sc_cflags & CCDF_MIRROR) {
921 * Mirroring. Writes go to both disks, reads are
922 * taken from whichever disk seems most appropriate.
924 * We attempt to localize reads to the disk whose arm
925 * is nearest the read request. We ignore seeks due
926 * to writes when making this determination and we
927 * also try to avoid hogging.
929 if (cbp[0]->cb_buf.b_cmd != BUF_CMD_READ) {
930 vn_strategy(cbp[0]->cb_vp,
931 &cbp[0]->cb_buf.b_bio1);
932 vn_strategy(cbp[1]->cb_vp,
933 &cbp[1]->cb_buf.b_bio1);
935 int pick = cs->sc_pick;
936 daddr_t range = cs->sc_size / 16 * cs->sc_geom.ccg_secsize;
937 if (doffset < cs->sc_blk[pick] - range ||
938 doffset > cs->sc_blk[pick] + range
940 cs->sc_pick = pick = 1 - pick;
942 cs->sc_blk[pick] = doffset + rcount;
943 vn_strategy(cbp[pick]->cb_vp,
944 &cbp[pick]->cb_buf.b_bio1);
950 vn_strategy(cbp[0]->cb_vp,
951 &cbp[0]->cb_buf.b_bio1);
959 * Build a component buffer header.
962 ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct bio *bio,
963 off_t doffset, caddr_t addr, long bcount)
965 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */
973 if (ccddebug & CCDB_IO)
974 kprintf("ccdbuffer(%x, %x, %d, %x, %d)\n",
975 cs, bp, bn, addr, bcount);
978 * Determine which component bn falls in.
980 bn = doffset / cs->sc_geom.ccg_secsize;
984 if (cs->sc_ileave == 0) {
986 * Serially concatenated and neither a mirror nor a parity
987 * config. This is a special case.
992 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
1000 * Calculate cbn, the logical superblock (sc_ileave chunks),
1001 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
1004 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
1005 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
1008 * Figure out which interleave table to use.
1010 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
1011 if (ii->ii_startblk > cbn)
1017 * off is the logical superblock relative to the beginning
1018 * of this interleave block.
1020 off = cbn - ii->ii_startblk;
1023 * We must calculate which disk component to use (ccdisk),
1024 * and recalculate cbn to be the superblock relative to
1025 * the beginning of the component. This is typically done by
1026 * adding 'off' and ii->ii_startoff together. However, 'off'
1027 * must typically be divided by the number of components in
1028 * this interleave array to properly convert it from a
1029 * CCD-relative logical superblock number to a
1030 * component-relative superblock number.
1032 if (ii->ii_ndisk == 1) {
1034 * When we have just one disk, it can't be a mirror
1035 * or a parity config.
1037 ccdisk = ii->ii_index[0];
1038 cbn = ii->ii_startoff + off;
1040 if (cs->sc_cflags & CCDF_MIRROR) {
1042 * We have forced a uniform mapping, resulting
1043 * in a single interleave array. We double
1044 * up on the first half of the available
1045 * components and our mirror is in the second
1046 * half. This only works with a single
1047 * interleave array because doubling up
1048 * doubles the number of sectors, so there
1049 * cannot be another interleave array because
1050 * the next interleave array's calculations
1053 int ndisk2 = ii->ii_ndisk / 2;
1054 ccdisk = ii->ii_index[off % ndisk2];
1055 cbn = ii->ii_startoff + off / ndisk2;
1056 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1057 } else if (cs->sc_cflags & CCDF_PARITY) {
1059 * XXX not implemented yet
1061 int ndisk2 = ii->ii_ndisk - 1;
1062 ccdisk = ii->ii_index[off % ndisk2];
1063 cbn = ii->ii_startoff + off / ndisk2;
1064 if (cbn % ii->ii_ndisk <= ccdisk)
1067 ccdisk = ii->ii_index[off % ii->ii_ndisk];
1068 cbn = ii->ii_startoff + off / ii->ii_ndisk;
1072 ci = &cs->sc_cinfo[ccdisk];
1075 * Convert cbn from a superblock to a normal block so it
1076 * can be used to calculate (along with cboff) the normal
1077 * block index into this particular disk.
1079 cbn *= cs->sc_ileave;
1083 * Fill in the component buf structure.
1085 * NOTE: devices do not use b_bufsize, only b_bcount, but b_bcount
1086 * will be truncated on device EOF so we use b_bufsize to detect
1090 cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1091 cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1092 cbp->cb_buf.b_data = addr;
1093 cbp->cb_vp = ci->ci_vp;
1094 if (cs->sc_ileave == 0)
1095 cbc = dbtob((off_t)(ci->ci_size - cbn));
1097 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1098 if (cbc > cs->sc_maxiosize)
1099 cbc = cs->sc_maxiosize;
1100 cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1101 cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1103 cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1104 cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1105 cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci->ci_skip);
1108 * context for ccdiodone
1111 cbp->cb_unit = cs - ccd_softc;
1112 cbp->cb_comp = ci - cs->sc_cinfo;
1115 if (ccddebug & CCDB_IO)
1116 kprintf(" dev %x(u%d): cbp %x off %lld addr %x bcnt %d\n",
1117 ci->ci_dev, ci-cs->sc_cinfo, cbp,
1118 cbp->cb_buf.b_bio1.bio_offset,
1119 cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1124 * Note: both I/O's setup when reading from mirror, but only one
1127 if (cs->sc_cflags & CCDF_MIRROR) {
1128 /* mirror, setup second I/O */
1131 cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1132 cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1133 cbp->cb_buf.b_data = addr;
1134 cbp->cb_vp = ci2->ci_vp;
1135 if (cs->sc_ileave == 0)
1136 cbc = dbtob((off_t)(ci->ci_size - cbn));
1138 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1139 if (cbc > cs->sc_maxiosize)
1140 cbc = cs->sc_maxiosize;
1141 cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1142 cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1144 cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1145 cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1146 cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci2->ci_skip);
1149 * context for ccdiodone
1152 cbp->cb_unit = cs - ccd_softc;
1153 cbp->cb_comp = ci2 - cs->sc_cinfo;
1155 /* link together the ccdbuf's and clear "mirror done" flag */
1156 cb[0]->cb_mirror = cb[1];
1157 cb[1]->cb_mirror = cb[0];
1158 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1159 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1164 ccdintr(struct ccd_softc *cs, struct bio *bio)
1166 struct buf *bp = bio->bio_buf;
1169 if (ccddebug & CCDB_FOLLOW)
1170 kprintf("ccdintr(%x, %x)\n", cs, bp);
1173 * Request is done for better or worse, wakeup the top half.
1175 if (bp->b_flags & B_ERROR)
1176 bp->b_resid = bp->b_bcount;
1177 devstat_end_transaction_buf(&cs->device_stats, bp);
1182 * Called at interrupt time.
1184 * Mark the component as done and if all components are done,
1185 * take a ccd interrupt.
1188 ccdiodone(struct bio *bio)
1190 struct ccdbuf *cbp = bio->bio_caller_info1.ptr;
1191 struct bio *obio = cbp->cb_obio;
1192 struct buf *obp = obio->bio_buf;
1193 int unit = cbp->cb_unit;
1197 * Since we do not have exclusive access to underlying devices,
1198 * we can't keep cache translations around.
1200 clearbiocache(bio->bio_next);
1205 if (ccddebug & CCDB_FOLLOW)
1206 kprintf("ccdiodone(%x)\n", cbp);
1207 if (ccddebug & CCDB_IO) {
1208 kprintf("ccdiodone: bp %x bcount %d resid %d\n",
1209 obp, obp->b_bcount, obp->b_resid);
1210 kprintf(" dev %x(u%d), cbp %x off %lld addr %x bcnt %d\n",
1211 cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1212 cbp->cb_buf.b_loffset, cbp->cb_buf.b_data,
1213 cbp->cb_buf.b_bcount);
1218 * If an error occurred, report it. If this is a mirrored
1219 * configuration and the first of two possible reads, do not
1220 * set the error in the bp yet because the second read may
1223 if (cbp->cb_buf.b_flags & B_ERROR) {
1224 const char *msg = "";
1226 if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1227 (cbp->cb_buf.b_cmd == BUF_CMD_READ) &&
1228 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1230 * We will try our read on the other disk down
1231 * below, also reverse the default pick so if we
1232 * are doing a scan we do not keep hitting the
1235 struct ccd_softc *cs = &ccd_softc[unit];
1237 msg = ", trying other disk";
1238 cs->sc_pick = 1 - cs->sc_pick;
1239 cs->sc_blk[cs->sc_pick] = obio->bio_offset;
1241 obp->b_flags |= B_ERROR;
1242 obp->b_error = cbp->cb_buf.b_error ?
1243 cbp->cb_buf.b_error : EIO;
1245 kprintf("ccd%d: error %d on component %d "
1246 "offset %jd (ccd offset %jd)%s\n",
1247 unit, obp->b_error, cbp->cb_comp,
1248 (intmax_t)cbp->cb_buf.b_bio2.bio_offset,
1249 (intmax_t)obio->bio_offset,
1254 * Process mirror. If we are writing, I/O has been initiated on both
1255 * buffers and we fall through only after both are finished.
1257 * If we are reading only one I/O is initiated at a time. If an
1258 * error occurs we initiate the second I/O and return, otherwise
1259 * we free the second I/O without initiating it.
1262 if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1263 if (cbp->cb_buf.b_cmd != BUF_CMD_READ) {
1265 * When writing, handshake with the second buffer
1266 * to determine when both are done. If both are not
1267 * done, return here.
1269 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1270 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1278 * When reading, either dispose of the second buffer
1279 * or initiate I/O on the second buffer if an error
1280 * occurred with this one.
1282 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1283 if (cbp->cb_buf.b_flags & B_ERROR) {
1284 cbp->cb_mirror->cb_pflags |=
1287 cbp->cb_mirror->cb_vp,
1288 &cbp->cb_mirror->cb_buf.b_bio1
1295 putccdbuf(cbp->cb_mirror);
1303 * Use our saved b_bufsize to determine if an unexpected EOF occurred.
1305 count = cbp->cb_buf.b_bufsize;
1309 * If all done, "interrupt".
1311 obp->b_resid -= count;
1312 if (obp->b_resid < 0)
1313 panic("ccdiodone: count");
1314 if (obp->b_resid == 0)
1315 ccdintr(&ccd_softc[unit], obio);
1321 ccdioctl(struct dev_ioctl_args *ap)	/* ioctl entry point: dispatches on ap->a_cmd to configure or tear down a ccd unit */
1323 cdev_t dev = ap->a_head.a_dev;
1324 int unit = ccdunit(dev);
1325 int i, j, lookedup = 0, error = 0;
1326 struct ccd_softc *cs;
1327 struct ccd_ioctl *ccio = (struct ccd_ioctl *)ap->a_data;
1328 struct ccddevice ccd;
1329 struct disk_info info;
1335 cs = &ccd_softc[unit];
1337 bzero(&ccd, sizeof(ccd));	/* zeroed template: filled in on configure, copied as-is to clear ccddevs[unit] on teardown */
1339 switch (ap->a_cmd) {
1341 if (cs->sc_flags & CCDF_INITED)	/* configure path: unit is already initialized */
1344 if ((ap->a_fflag & FWRITE) == 0)	/* FWRITE not set in the open flags */
1347 if ((error = ccdlock(cs)) != 0)
1350 if (ccio->ccio_ndisks > CCD_MAXNDISKS) {	/* bound the user-supplied count before it sizes the arrays below */
1355 /* Fill in some important bits. */
1356 ccd.ccd_unit = unit;
1357 ccd.ccd_interleave = ccio->ccio_ileave;
1358 if (ccd.ccd_interleave == 0 &&
1359 ((ccio->ccio_flags & CCDF_MIRROR) ||
1360 (ccio->ccio_flags & CCDF_PARITY))) {
1361 kprintf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1362 ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1364 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1365 (ccio->ccio_flags & CCDF_PARITY)) {
1366 kprintf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1367 ccio->ccio_flags &= ~CCDF_PARITY;
1369 if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1370 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1371 kprintf("ccd%d: mirror/parity forces uniform flag\n",
1373 ccio->ccio_flags |= CCDF_UNIFORM;
1375 ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;	/* keep only the bits in CCDF_USERMASK */
1378 * Allocate space for and copy in the array of
1379 * component pathnames and device numbers.
1381 cpp = kmalloc(ccio->ccio_ndisks * sizeof(char *),
1382 M_DEVBUF, M_WAITOK);
1383 vpp = kmalloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1384 M_DEVBUF, M_WAITOK);
1386 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1387 ccio->ccio_ndisks * sizeof(char *));	/* same element size as the cpp allocation above */
1389 kfree(vpp, M_DEVBUF);
1390 kfree(cpp, M_DEVBUF);
1396 if (ccddebug & CCDB_INIT) {
1397 for (i = 0; i < ccio->ccio_ndisks; ++i)
1398 kprintf("ccdioctl: component %d: 0x%x\n",
1403 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1405 if (ccddebug & CCDB_INIT)
1406 kprintf("ccdioctl: lookedup = %d\n", lookedup);
1408 if ((error = ccdlookup(cpp[i], &vpp[i])) != 0) {
1409 for (j = 0; j < lookedup; ++j)	/* lookup failed: close the vnodes obtained so far */
1410 (void)vn_close(vpp[j], FREAD|FWRITE);
1411 kfree(vpp, M_DEVBUF);
1412 kfree(cpp, M_DEVBUF);
1420 ccd.ccd_ndev = ccio->ccio_ndisks;
1423 * Initialize the ccd. Fills in the softc for us.
1425 if ((error = ccdinit(&ccd, cpp, ap->a_cred)) != 0) {
1426 for (j = 0; j < lookedup; ++j)	/* init failed: close every component and free both arrays */
1427 (void)vn_close(vpp[j], FREAD|FWRITE);
1428 kfree(vpp, M_DEVBUF);
1429 kfree(cpp, M_DEVBUF);
1435 * The ccd has been successfully initialized, so
1436 * we can place it into the array and read the disklabel.
1438 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1439 ccio->ccio_unit = unit;	/* report the unit and resulting size back through the ioctl argument */
1440 ccio->ccio_size = cs->sc_size;
1442 bzero(&info, sizeof(info));
1443 info.d_media_blksize = cs->sc_geom.ccg_secsize;
1444 info.d_media_blocks = cs->sc_size;
1445 info.d_nheads = cs->sc_geom.ccg_ntracks;
1446 info.d_secpertrack = cs->sc_geom.ccg_nsectors;
1447 info.d_ncylinders = cs->sc_geom.ccg_ncylinders;
1448 info.d_secpercyl = info.d_nheads * info.d_secpertrack;
1451 * For cases where a label is directly applied to the ccd,
1452 * without slices, DSO_COMPATMBR forces one sector be
1453 * reserved for backwards compatibility.
1455 info.d_dsflags = DSO_COMPATMBR;
1456 disk_setdiskinfo(&cs->sc_disk, &info);
1463 if ((cs->sc_flags & CCDF_INITED) == 0)	/* teardown path: unit is not initialized */
1466 if ((ap->a_fflag & FWRITE) == 0)
1469 if ((error = ccdlock(cs)) != 0)
1472 if (dev_drefs(cs->sc_dev) > 1) {	/* NOTE(review): presumably guards against teardown while other references exist -- confirm */
1478 * Free ccd_softc information and clear entry.
1481 /* Close the components and free their pathnames. */
1482 for (i = 0; i < cs->sc_nccdisks; ++i) {
1484 * XXX: this close could potentially fail and
1485 * cause Bad Things. Maybe we need to force
1486 * the close to happen?
1489 if (ccddebug & CCDB_VNODE)
1490 vprint("CCDIOCCLR: vnode info",
1491 cs->sc_cinfo[i].ci_vp);
1493 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE);
1494 kfree(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1497 /* Free interleave index. */
1498 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)	/* table ends at the first entry with ii_ndisk == 0 */
1499 kfree(cs->sc_itable[i].ii_index, M_DEVBUF);
1501 /* Free component info and interleave table. */
1502 kfree(cs->sc_cinfo, M_DEVBUF);
1503 kfree(cs->sc_itable, M_DEVBUF);
1504 cs->sc_cinfo = NULL;
1505 cs->sc_itable = NULL;
1506 cs->sc_flags &= ~CCDF_INITED;
1509 * Free ccddevice information and clear entry.
1511 kfree(ccddevs[unit].ccd_cpp, M_DEVBUF);
1512 kfree(ccddevs[unit].ccd_vpp, M_DEVBUF);
1513 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));	/* ccd is still all-zero here, so this clears the slot */
1516 * And remove the devstat entry.
1518 devstat_remove_entry(&cs->device_stats);
1520 /* This must be atomic. */
1535 ccddump(struct dev_dump_args *ap)	/* dump entry point for the ccd device */
1537 /* Not implemented: no dump work is performed for ccd devices here. */
1542 * Lookup the provided name in the filesystem. If the file exists,
1543 * is a valid block device, and isn't being used by anyone else,
1544 * set *vpp to the file's vnode.
1547 ccdlookup(char *path, struct vnode **vpp)
1549 struct nlookupdata nd;
1555 error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1558 if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1560 if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1561 kprintf("ccdlookup: vn_open error = %d\n", error);
1567 if (vp->v_opencount > 1) {
1572 if (!vn_isdisk(vp, &error))
1576 if (ccddebug & CCDB_VNODE)
1577 vprint("ccdlookup: vnode info", vp);
1581 nd.nl_open_vp = NULL;
1583 *vpp = vp; /* leave ref intact */
1591 * Wait interruptibly for an exclusive lock.
1594 * Several drivers do this; it should be abstracted and made MP-safe.
1597 ccdlock(struct ccd_softc *cs)
1601 while ((cs->sc_flags & CCDF_LOCKED) != 0) {	/* wait for as long as CCDF_LOCKED is set */
1602 cs->sc_flags |= CCDF_WANTED;	/* ccdunlock() tests and clears this flag */
1603 if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)	/* sleep on cs; a nonzero result is kept in error */
1606 cs->sc_flags |= CCDF_LOCKED;	/* CCDF_LOCKED was clear: take the lock */
1611 * Unlock and wake up any waiters.
1614 ccdunlock(struct ccd_softc *cs)
1617 cs->sc_flags &= ~CCDF_LOCKED;	/* drop the lock bit taken by ccdlock() */
1618 if ((cs->sc_flags & CCDF_WANTED) != 0) {	/* set by ccdlock() before it sleeps */
1619 cs->sc_flags &= ~CCDF_WANTED;	/* consume the request */
1626 printiinfo(struct ccdiinfo *ii)	/* debug helper: prints each interleave table entry via kprintf */
1630 for (ix = 0; ii->ii_ndisk; ix++, ii++) {	/* walk entries until one has ii_ndisk == 0 */
1631 kprintf(" itab[%d]: #dk %d sblk %d soff %d",
1632 ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
1633 for (i = 0; i < ii->ii_ndisk; i++)	/* then the entry's ii_ndisk index values */
1634 kprintf(" %d", ii->ii_index[i]);
1641 /* Local Variables: */
1642 /* c-argdecl-indent: 8 */
1643 /* c-continued-statement-offset: 8 */
1644 /* c-indent-level: 8 */