1 /* $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $ */
3 /* $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ */
6 * Copyright (c) 1995 Jason R. Thorpe.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed for the NetBSD Project
21 * 4. The name of the author may not be used to endorse or promote products
22 * derived from this software without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * Copyright (c) 1988 University of Utah.
39 * Copyright (c) 1990, 1993
40 * The Regents of the University of California. All rights reserved.
42 * This code is derived from software contributed to Berkeley by
43 * the Systems Programming Group of the University of Utah Computer
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted provided that the following conditions
49 * 1. Redistributions of source code must retain the above copyright
50 * notice, this list of conditions and the following disclaimer.
51 * 2. Redistributions in binary form must reproduce the above copyright
52 * notice, this list of conditions and the following disclaimer in the
53 * documentation and/or other materials provided with the distribution.
54 * 3. All advertising materials mentioning features or use of this software
55 * must display the following acknowledgement:
56 * This product includes software developed by the University of
57 * California, Berkeley and its contributors.
58 * 4. Neither the name of the University nor the names of its contributors
59 * may be used to endorse or promote products derived from this software
60 * without specific prior written permission.
62 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * from: Utah $Hdr: cd.c 1.6 90/11/28$
76 * @(#)cd.c 8.2 (Berkeley) 11/16/93
80 * "Concatenated" disk driver.
82 * Dynamic configuration and disklabel support by:
83 * Jason R. Thorpe <thorpej@nas.nasa.gov>
84 * Numerical Aerodynamic Simulation Facility
86 * NASA Ames Research Center
87 * Moffett Field, CA 94035
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/kernel.h>
95 #include <sys/module.h>
98 #include <sys/malloc.h>
99 #include <sys/namei.h>
100 #include <sys/conf.h>
101 #include <sys/stat.h>
102 #include <sys/sysctl.h>
103 #include <sys/disklabel.h>
104 #include <ufs/ffs/fs.h>
105 #include <sys/devicestat.h>
106 #include <sys/fcntl.h>
107 #include <sys/vnode.h>
109 #include <sys/ccdvar.h>
111 #include <vm/vm_zone.h>
113 #if defined(CCDDEBUG) && !defined(DEBUG)
118 #define CCDB_FOLLOW 0x01
119 #define CCDB_INIT 0x02
121 #define CCDB_LABEL 0x08
122 #define CCDB_VNODE 0x10
123 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
125 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
129 #define ccdunit(x) dkunit(x)
130 #define ccdpart(x) dkpart(x)
133 This is how mirroring works (only writes are special):
135 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
136 linked together by the cb_mirror field. "cb_pflags &
137 CCDPF_MIRROR_DONE" is set to 0 on both of them.
139 When a component returns to ccdiodone(), it checks if "cb_pflags &
140 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
141 flag and returns. If it is, it means its partner has already
142 returned, so it will go to the regular cleanup.
147 struct buf cb_buf; /* new I/O buf */
148 struct buf *cb_obp; /* ptr. to original I/O buf */
149 struct ccdbuf *cb_freenext; /* free list link */
150 int cb_unit; /* target unit */
151 int cb_comp; /* target component */
152 int cb_pflags; /* mirror/parity status flag */
153 struct ccdbuf *cb_mirror; /* mirror counterpart */
156 /* bits in cb_pflags */
157 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
159 #define CCDLABELDEV(dev) \
160 (makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
162 static d_open_t ccdopen;
163 static d_close_t ccdclose;
164 static d_strategy_t ccdstrategy;
165 static d_ioctl_t ccdioctl;
166 static d_dump_t ccddump;
167 static d_psize_t ccdsize;
169 #define NCCDFREEHIWAT 16
171 #define CDEV_MAJOR 74
172 #define BDEV_MAJOR 21
174 static struct cdevsw ccd_cdevsw = {
176 /* close */ ccdclose,
178 /* write */ physwrite,
179 /* ioctl */ ccdioctl,
182 /* strategy */ ccdstrategy,
184 /* maj */ CDEV_MAJOR,
188 /* bmaj */ BDEV_MAJOR
191 /* called during module initialization */
192 static void ccdattach __P((void));
193 static int ccd_modevent __P((module_t, int, void *));
195 /* called by biodone() at interrupt time */
196 static void ccdiodone __P((struct ccdbuf *cbp));
198 static void ccdstart __P((struct ccd_softc *, struct buf *));
199 static void ccdinterleave __P((struct ccd_softc *, int));
200 static void ccdintr __P((struct ccd_softc *, struct buf *));
201 static int ccdinit __P((struct ccddevice *, char **, struct proc *));
202 static int ccdlookup __P((char *, struct proc *p, struct vnode **));
203 static void ccdbuffer __P((struct ccdbuf **ret, struct ccd_softc *,
204 struct buf *, daddr_t, caddr_t, long));
205 static void ccdgetdisklabel __P((dev_t));
206 static void ccdmakedisklabel __P((struct ccd_softc *));
207 static int ccdlock __P((struct ccd_softc *));
208 static void ccdunlock __P((struct ccd_softc *));
211 static void printiinfo __P((struct ccdiinfo *));
214 /* Non-private for the benefit of libkvm. */
215 struct ccd_softc *ccd_softc;
216 struct ccddevice *ccddevs;
217 struct ccdbuf *ccdfreebufs;
218 static int numccdfreebufs;
219 static int numccd = 0;
222 * getccdbuf() - Allocate and zero a ccd buffer.
224 * This routine is called at splbio().
229 getccdbuf(struct ccdbuf *cpy)
234 * Allocate from freelist or malloc as necessary
236 if ((cbp = ccdfreebufs) != NULL) {
237 ccdfreebufs = cbp->cb_freenext;
240 cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
244 * Used by mirroring code
247 bcopy(cpy, cbp, sizeof(struct ccdbuf));
249 bzero(cbp, sizeof(struct ccdbuf));
252 * independent struct buf initialization
254 LIST_INIT(&cbp->cb_buf.b_dep);
255 BUF_LOCKINIT(&cbp->cb_buf);
256 BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
257 BUF_KERNPROC(&cbp->cb_buf);
263 * putccdbuf() - Free a ccd buffer.
265 * This routine is called at splbio().
270 putccdbuf(struct ccdbuf *cbp)
272 BUF_UNLOCK(&cbp->cb_buf);
273 BUF_LOCKFREE(&cbp->cb_buf);
275 if (numccdfreebufs < NCCDFREEHIWAT) {
276 cbp->cb_freenext = ccdfreebufs;
280 free((caddr_t)cbp, M_DEVBUF);
286 * Number of blocks to leave untouched in front of a component partition.
287 * This is to avoid violating its disklabel area when it starts at the
288 * beginning of the slice.
290 #if !defined(CCD_OFFSET)
291 #define CCD_OFFSET 16
295 * Called by main() during pseudo-device attachment. All we need
296 * to do is allocate enough space for devices to be configured later, and
306 printf("ccd0-%d: Concatenated disk drivers\n", num-1);
308 printf("ccd0: Concatenated disk driver\n");
310 ccd_softc = (struct ccd_softc *)malloc(num * sizeof(struct ccd_softc),
312 ccddevs = (struct ccddevice *)malloc(num * sizeof(struct ccddevice),
314 if ((ccd_softc == NULL) || (ccddevs == NULL)) {
315 printf("WARNING: no memory for concatenated disks\n");
316 if (ccd_softc != NULL)
317 free(ccd_softc, M_DEVBUF);
319 free(ccddevs, M_DEVBUF);
323 bzero(ccd_softc, num * sizeof(struct ccd_softc));
324 bzero(ccddevs, num * sizeof(struct ccddevice));
326 cdevsw_add(&ccd_cdevsw);
327 /* XXX: is this necessary? */
328 for (i = 0; i < numccd; ++i)
329 ccddevs[i].ccd_dk = -1;
333 ccd_modevent(mod, type, data)
346 printf("ccd0: Unload not supported!\n");
350 default: /* MOD_SHUTDOWN etc */
356 DEV_MODULE(ccd, ccd_modevent, NULL);
359 ccdinit(ccd, cpaths, p)
360 struct ccddevice *ccd;
364 struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
365 struct ccdcinfo *ci = NULL; /* XXX */
371 struct partinfo dpart;
372 struct ccdgeom *ccg = &cs->sc_geom;
373 char tmppath[MAXPATHLEN];
377 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
378 printf("ccdinit: unit %d\n", ccd->ccd_unit);
382 cs->sc_ileave = ccd->ccd_interleave;
383 cs->sc_nccdisks = ccd->ccd_ndev;
385 /* Allocate space for the component info. */
386 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
390 * Verify that each component piece exists and record
391 * relevant information about it.
395 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
396 vp = ccd->ccd_vpp[ix];
397 ci = &cs->sc_cinfo[ix];
401 * Copy in the pathname of the component.
403 bzero(tmppath, sizeof(tmppath)); /* sanity */
404 if ((error = copyinstr(cpaths[ix], tmppath,
405 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
407 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
408 printf("ccd%d: can't copy path, error = %d\n",
409 ccd->ccd_unit, error);
413 ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
414 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
416 ci->ci_dev = vn_todev(vp);
419 * Get partition information for the component.
421 if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
422 FREAD, p->p_ucred, p)) != 0) {
424 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
425 printf("ccd%d: %s: ioctl failed, error = %d\n",
426 ccd->ccd_unit, ci->ci_path, error);
430 if (dpart.part->p_fstype == FS_BSDFFS) {
432 ((dpart.disklab->d_secsize > maxsecsize) ?
433 dpart.disklab->d_secsize : maxsecsize);
434 size = dpart.part->p_size - CCD_OFFSET;
437 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
438 printf("ccd%d: %s: incorrect partition type\n",
439 ccd->ccd_unit, ci->ci_path);
446 * Calculate the size, truncating to an interleave
447 * boundary if necessary.
450 if (cs->sc_ileave > 1)
451 size -= size % cs->sc_ileave;
455 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
456 printf("ccd%d: %s: size == 0\n",
457 ccd->ccd_unit, ci->ci_path);
463 if (minsize == 0 || size < minsize)
470 * Don't allow the interleave to be smaller than
471 * the biggest component sector.
473 if ((cs->sc_ileave > 0) &&
474 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
476 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
477 printf("ccd%d: interleave must be at least %d\n",
478 ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
485 * If uniform interleave is desired set all sizes to that of
486 * the smallest component. This will guarantee that a single
487 * interleave table is generated.
489 * Lost space must be taken into account when calculating the
490 * overall size. Half the space is lost when CCDF_MIRROR is
491 * specified. One disk is lost when CCDF_PARITY is specified.
493 if (ccd->ccd_flags & CCDF_UNIFORM) {
494 for (ci = cs->sc_cinfo;
495 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
496 ci->ci_size = minsize;
498 if (ccd->ccd_flags & CCDF_MIRROR) {
500 * Check to see if an even number of components
501 * have been specified. The interleave must also
502 * be non-zero in order for us to be able to
503 * guarantee the topology.
505 if (cs->sc_nccdisks % 2) {
506 printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
510 if (cs->sc_ileave == 0) {
511 printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
515 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
516 } else if (ccd->ccd_flags & CCDF_PARITY) {
517 cs->sc_size = (cs->sc_nccdisks-1) * minsize;
519 if (cs->sc_ileave == 0) {
520 printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
524 cs->sc_size = cs->sc_nccdisks * minsize;
529 * Construct the interleave table.
531 ccdinterleave(cs, ccd->ccd_unit);
534 * Create pseudo-geometry based on 1MB cylinders. It's
537 ccg->ccg_secsize = maxsecsize;
538 ccg->ccg_ntracks = 1;
539 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
540 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
543 * Add a devstat entry for this device.
545 devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
546 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
547 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
548 DEVSTAT_PRIORITY_ARRAY);
550 cs->sc_flags |= CCDF_INITED;
551 cs->sc_cflags = ccd->ccd_flags; /* So we can find out later... */
552 cs->sc_unit = ccd->ccd_unit;
555 while (ci > cs->sc_cinfo) {
557 free(ci->ci_path, M_DEVBUF);
559 free(cs->sc_cinfo, M_DEVBUF);
564 ccdinterleave(cs, unit)
565 struct ccd_softc *cs;
568 struct ccdcinfo *ci, *smallci;
575 if (ccddebug & CCDB_INIT)
576 printf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
580 * Allocate an interleave table. The worst case occurs when each
581 * of N disks is of a different size, resulting in N interleave
584 * Chances are this is too big, but we don't care.
586 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
587 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF, M_WAITOK);
588 bzero((caddr_t)cs->sc_itable, size);
591 * Trivial case: no interleave (actually interleave of disk size).
592 * Each table entry represents a single component in its entirety.
594 * An interleave of 0 may not be used with a mirror or parity setup.
596 if (cs->sc_ileave == 0) {
600 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
601 /* Allocate space for ii_index. */
602 ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
604 ii->ii_startblk = bn;
606 ii->ii_index[0] = ix;
607 bn += cs->sc_cinfo[ix].ci_size;
612 if (ccddebug & CCDB_INIT)
613 printiinfo(cs->sc_itable);
619 * The following isn't fast or pretty; it doesn't have to be.
623 for (ii = cs->sc_itable; ; ii++) {
625 * Allocate space for ii_index. We might allocate more than
628 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
632 * Locate the smallest of the remaining components
635 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
637 if (ci->ci_size > size &&
639 ci->ci_size < smallci->ci_size)) {
645 * Nobody left, all done
647 if (smallci == NULL) {
653 * Record starting logical block using an sc_ileave blocksize.
655 ii->ii_startblk = bn / cs->sc_ileave;
658 * Record starting component block using an sc_ileave
659 * blocksize. This value is relative to the beginning of
662 ii->ii_startoff = lbn;
665 * Determine how many disks take part in this interleave
666 * and record their indices.
669 for (ci = cs->sc_cinfo;
670 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
671 if (ci->ci_size >= smallci->ci_size) {
672 ii->ii_index[ix++] = ci - cs->sc_cinfo;
676 bn += ix * (smallci->ci_size - size);
677 lbn = smallci->ci_size / cs->sc_ileave;
678 size = smallci->ci_size;
681 if (ccddebug & CCDB_INIT)
682 printiinfo(cs->sc_itable);
688 ccdopen(dev, flags, fmt, p)
693 int unit = ccdunit(dev);
694 struct ccd_softc *cs;
695 struct disklabel *lp;
696 int error = 0, part, pmask;
699 if (ccddebug & CCDB_FOLLOW)
700 printf("ccdopen(%x, %x)\n", dev, flags);
704 cs = &ccd_softc[unit];
706 if ((error = ccdlock(cs)) != 0)
715 * If we're initialized, check to see if there are any other
716 * open partitions. If not, then it's safe to update
717 * the in-core disklabel.
719 if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
720 ccdgetdisklabel(dev);
722 /* Check that the partition exists. */
723 if (part != RAW_PART && ((part >= lp->d_npartitions) ||
724 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
729 cs->sc_openmask |= pmask;
737 ccdclose(dev, flags, fmt, p)
742 int unit = ccdunit(dev);
743 struct ccd_softc *cs;
747 if (ccddebug & CCDB_FOLLOW)
748 printf("ccdclose(%x, %x)\n", dev, flags);
753 cs = &ccd_softc[unit];
755 if ((error = ccdlock(cs)) != 0)
760 /* ...that much closer to allowing unconfiguration... */
761 cs->sc_openmask &= ~(1 << part);
770 int unit = ccdunit(bp->b_dev);
771 struct ccd_softc *cs = &ccd_softc[unit];
774 struct disklabel *lp;
777 if (ccddebug & CCDB_FOLLOW)
778 printf("ccdstrategy(%x): unit %d\n", bp, unit);
780 if ((cs->sc_flags & CCDF_INITED) == 0) {
782 bp->b_flags |= B_ERROR;
786 /* If it's a nil transfer, wake up the top half now. */
787 if (bp->b_bcount == 0)
793 * Do bounds checking and adjust transfer. If there's an
794 * error, the bounds check will flag that for us.
796 wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
797 if (ccdpart(bp->b_dev) != RAW_PART) {
798 if (bounds_check_with_label(bp, lp, wlabel) <= 0)
801 int pbn; /* in sc_secsize chunks */
802 long sz; /* in sc_secsize chunks */
804 pbn = bp->b_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
805 sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
808 * If out of bounds return an error. If at the EOF point,
809 * simply read or write less.
812 if (pbn < 0 || pbn >= cs->sc_size) {
813 bp->b_resid = bp->b_bcount;
814 if (pbn != cs->sc_size) {
815 bp->b_error = EINVAL;
816 bp->b_flags |= B_ERROR | B_INVAL;
822 * If the request crosses EOF, truncate the request.
824 if (pbn + sz > cs->sc_size) {
825 bp->b_bcount = (cs->sc_size - pbn) *
826 cs->sc_geom.ccg_secsize;
830 bp->b_resid = bp->b_bcount;
845 struct ccd_softc *cs;
849 struct ccdbuf *cbp[4];
850 /* XXX! : 2 reads and 2 writes for RAID 4/5 */
853 struct partition *pp;
856 if (ccddebug & CCDB_FOLLOW)
857 printf("ccdstart(%x, %x)\n", cs, bp);
860 /* Record the transaction start */
861 devstat_start_transaction(&cs->device_stats);
864 * Translate the partition-relative block number to an absolute.
867 if (ccdpart(bp->b_dev) != RAW_PART) {
868 pp = &cs->sc_label.d_partitions[ccdpart(bp->b_dev)];
873 * Allocate component buffers and fire off the requests
876 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
877 ccdbuffer(cbp, cs, bp, bn, addr, bcount);
878 rcount = cbp[0]->cb_buf.b_bcount;
880 if (cs->sc_cflags & CCDF_MIRROR) {
882 * Mirroring. Writes go to both disks, reads are
883 * taken from whichever disk seems most appropriate.
885 * We attempt to localize reads to the disk whose arm
886 * is nearest the read request. We ignore seeks due
887 * to writes when making this determination and we
888 * also try to avoid hogging.
890 if ((cbp[0]->cb_buf.b_flags & B_READ) == 0) {
891 cbp[0]->cb_buf.b_vp->v_numoutput++;
892 cbp[1]->cb_buf.b_vp->v_numoutput++;
893 VOP_STRATEGY(cbp[0]->cb_buf.b_vp,
895 VOP_STRATEGY(cbp[1]->cb_buf.b_vp,
898 int pick = cs->sc_pick;
899 daddr_t range = cs->sc_size / 16;
901 if (bn < cs->sc_blk[pick] - range ||
902 bn > cs->sc_blk[pick] + range
904 cs->sc_pick = pick = 1 - pick;
906 cs->sc_blk[pick] = bn + btodb(rcount);
907 VOP_STRATEGY(cbp[pick]->cb_buf.b_vp,
914 if ((cbp[0]->cb_buf.b_flags & B_READ) == 0)
915 cbp[0]->cb_buf.b_vp->v_numoutput++;
916 VOP_STRATEGY(cbp[0]->cb_buf.b_vp, &cbp[0]->cb_buf);
924 * Build a component buffer header.
927 ccdbuffer(cb, cs, bp, bn, addr, bcount)
929 struct ccd_softc *cs;
935 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */
941 if (ccddebug & CCDB_IO)
942 printf("ccdbuffer(%x, %x, %d, %x, %d)\n",
943 cs, bp, bn, addr, bcount);
946 * Determine which component bn falls in.
951 if (cs->sc_ileave == 0) {
953 * Serially concatenated and neither a mirror nor a parity
954 * config. This is a special case.
959 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
967 * Calculate cbn, the logical superblock (sc_ileave chunks),
968 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
971 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
972 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
975 * Figure out which interleave table to use.
977 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
978 if (ii->ii_startblk > cbn)
984 * off is the logical superblock relative to the beginning
985 * of this interleave block.
987 off = cbn - ii->ii_startblk;
990 * We must calculate which disk component to use (ccdisk),
991 * and recalculate cbn to be the superblock relative to
992 * the beginning of the component. This is typically done by
993 * adding 'off' and ii->ii_startoff together. However, 'off'
994 * must typically be divided by the number of components in
995 * this interleave array to properly convert it from a
996 * CCD-relative logical superblock number to a
997 * component-relative superblock number.
999 if (ii->ii_ndisk == 1) {
1001 * When we have just one disk, it can't be a mirror
1002 * or a parity config.
1004 ccdisk = ii->ii_index[0];
1005 cbn = ii->ii_startoff + off;
1007 if (cs->sc_cflags & CCDF_MIRROR) {
1009 * We have forced a uniform mapping, resulting
1010 * in a single interleave array. We double
1011 * up on the first half of the available
1012 * components and our mirror is in the second
1013 * half. This only works with a single
1014 * interleave array because doubling up
1015 * doubles the number of sectors, so there
1016 * cannot be another interleave array because
1017 * the next interleave array's calculations
1020 int ndisk2 = ii->ii_ndisk / 2;
1021 ccdisk = ii->ii_index[off % ndisk2];
1022 cbn = ii->ii_startoff + off / ndisk2;
1023 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1024 } else if (cs->sc_cflags & CCDF_PARITY) {
1026 * XXX not implemented yet
1028 int ndisk2 = ii->ii_ndisk - 1;
1029 ccdisk = ii->ii_index[off % ndisk2];
1030 cbn = ii->ii_startoff + off / ndisk2;
1031 if (cbn % ii->ii_ndisk <= ccdisk)
1034 ccdisk = ii->ii_index[off % ii->ii_ndisk];
1035 cbn = ii->ii_startoff + off / ii->ii_ndisk;
1039 ci = &cs->sc_cinfo[ccdisk];
1042 * Convert cbn from a superblock to a normal block so it
1043 * can be used to calculate (along with cboff) the normal
1044 * block index into this particular disk.
1046 cbn *= cs->sc_ileave;
1050 * Fill in the component buf structure.
1052 cbp = getccdbuf(NULL);
1053 cbp->cb_buf.b_flags = bp->b_flags | B_CALL;
1054 cbp->cb_buf.b_iodone = (void (*)(struct buf *))ccdiodone;
1055 cbp->cb_buf.b_dev = ci->ci_dev; /* XXX */
1056 cbp->cb_buf.b_blkno = cbn + cboff + CCD_OFFSET;
1057 cbp->cb_buf.b_offset = dbtob(cbn + cboff + CCD_OFFSET);
1058 cbp->cb_buf.b_data = addr;
1059 cbp->cb_buf.b_vp = ci->ci_vp;
1060 if (cs->sc_ileave == 0)
1061 cbc = dbtob((off_t)(ci->ci_size - cbn));
1063 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1064 cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1065 cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1068 * context for ccdiodone
1071 cbp->cb_unit = cs - ccd_softc;
1072 cbp->cb_comp = ci - cs->sc_cinfo;
1075 if (ccddebug & CCDB_IO)
1076 printf(" dev %x(u%d): cbp %x bn %d addr %x bcnt %d\n",
1077 ci->ci_dev, ci-cs->sc_cinfo, cbp, cbp->cb_buf.b_blkno,
1078 cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1083 * Note: both I/O's setup when reading from mirror, but only one
1086 if (cs->sc_cflags & CCDF_MIRROR) {
1087 /* mirror, setup second I/O */
1088 cbp = getccdbuf(cb[0]);
1089 cbp->cb_buf.b_dev = ci2->ci_dev;
1090 cbp->cb_buf.b_vp = ci2->ci_vp;
1091 cbp->cb_comp = ci2 - cs->sc_cinfo;
1093 /* link together the ccdbuf's and clear "mirror done" flag */
1094 cb[0]->cb_mirror = cb[1];
1095 cb[1]->cb_mirror = cb[0];
1096 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1097 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1103 struct ccd_softc *cs;
1107 if (ccddebug & CCDB_FOLLOW)
1108 printf("ccdintr(%x, %x)\n", cs, bp);
1111 * Request is done for better or worse, wakeup the top half.
1113 if (bp->b_flags & B_ERROR)
1114 bp->b_resid = bp->b_bcount;
1115 devstat_end_transaction_buf(&cs->device_stats, bp);
1120 * Called at interrupt time.
1121 * Mark the component as done and if all components are done,
1122 * take a ccd interrupt.
1128 struct buf *bp = cbp->cb_obp;
1129 int unit = cbp->cb_unit;
1134 if (ccddebug & CCDB_FOLLOW)
1135 printf("ccdiodone(%x)\n", cbp);
1136 if (ccddebug & CCDB_IO) {
1137 printf("ccdiodone: bp %x bcount %d resid %d\n",
1138 bp, bp->b_bcount, bp->b_resid);
1139 printf(" dev %x(u%d), cbp %x bn %d addr %x bcnt %d\n",
1140 cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1141 cbp->cb_buf.b_blkno, cbp->cb_buf.b_data,
1142 cbp->cb_buf.b_bcount);
1146 * If an error occurred, report it. If this is a mirrored
1147 * configuration and the first of two possible reads, do not
1148 * set the error in the bp yet because the second read may
1152 if (cbp->cb_buf.b_flags & B_ERROR) {
1153 const char *msg = "";
1155 if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1156 (cbp->cb_buf.b_flags & B_READ) &&
1157 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1159 * We will try our read on the other disk down
1160 * below, also reverse the default pick so if we
1161 * are doing a scan we do not keep hitting the
1164 struct ccd_softc *cs = &ccd_softc[unit];
1166 msg = ", trying other disk";
1167 cs->sc_pick = 1 - cs->sc_pick;
1168 cs->sc_blk[cs->sc_pick] = bp->b_blkno;
1170 bp->b_flags |= B_ERROR;
1171 bp->b_error = cbp->cb_buf.b_error ?
1172 cbp->cb_buf.b_error : EIO;
1174 printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1175 unit, bp->b_error, cbp->cb_comp,
1176 (int)cbp->cb_buf.b_blkno, bp->b_blkno, msg);
1180 * Process mirror. If we are writing, I/O has been initiated on both
1181 * buffers and we fall through only after both are finished.
1183 * If we are reading only one I/O is initiated at a time. If an
1184 * error occurs we initiate the second I/O and return, otherwise
1185 * we free the second I/O without initiating it.
1188 if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1189 if ((cbp->cb_buf.b_flags & B_READ) == 0) {
1191 * When writing, handshake with the second buffer
1192 * to determine when both are done. If both are not
1193 * done, return here.
1195 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1196 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1203 * When reading, either dispose of the second buffer
1204 * or initiate I/O on the second buffer if an error
1205 * occurred with this one.
1207 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1208 if (cbp->cb_buf.b_flags & B_ERROR) {
1209 cbp->cb_mirror->cb_pflags |=
1212 cbp->cb_mirror->cb_buf.b_vp,
1213 &cbp->cb_mirror->cb_buf
1219 putccdbuf(cbp->cb_mirror);
1227 * use b_bufsize to determine how big the original request was rather
1228 * than b_bcount, because b_bcount may have been truncated for EOF.
1230 * XXX We check for an error, but we do not test the resid for an
1231 * aligned EOF condition. This may result in character & block
1232 * device access not recognizing EOF properly when read or written
1233 * sequentially, but will not affect filesystems.
1235 count = cbp->cb_buf.b_bufsize;
1239 * If all done, "interrupt".
1241 bp->b_resid -= count;
1242 if (bp->b_resid < 0)
1243 panic("ccdiodone: count");
1244 if (bp->b_resid == 0)
1245 ccdintr(&ccd_softc[unit], bp);
1250 ccdioctl(dev, cmd, data, flag, p)
1257 int unit = ccdunit(dev);
1258 int i, j, lookedup = 0, error = 0;
1260 struct ccd_softc *cs;
1261 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1262 struct ccddevice ccd;
1268 cs = &ccd_softc[unit];
1270 bzero(&ccd, sizeof(ccd));
1274 if (cs->sc_flags & CCDF_INITED)
1277 if ((flag & FWRITE) == 0)
1280 if ((error = ccdlock(cs)) != 0)
1283 if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1286 /* Fill in some important bits. */
1287 ccd.ccd_unit = unit;
1288 ccd.ccd_interleave = ccio->ccio_ileave;
1289 if (ccd.ccd_interleave == 0 &&
1290 ((ccio->ccio_flags & CCDF_MIRROR) ||
1291 (ccio->ccio_flags & CCDF_PARITY))) {
1292 printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1293 ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1295 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1296 (ccio->ccio_flags & CCDF_PARITY)) {
1297 printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1298 ccio->ccio_flags &= ~CCDF_PARITY;
1300 if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1301 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1302 printf("ccd%d: mirror/parity forces uniform flag\n",
1304 ccio->ccio_flags |= CCDF_UNIFORM;
1306 ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1309 * Allocate space for and copy in the array of
1310 * component pathnames and device numbers.
1312 cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1313 M_DEVBUF, M_WAITOK);
1314 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1315 M_DEVBUF, M_WAITOK);
1317 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1318 ccio->ccio_ndisks * sizeof(char **));
1320 free(vpp, M_DEVBUF);
1321 free(cpp, M_DEVBUF);
1327 if (ccddebug & CCDB_INIT)
1328 for (i = 0; i < ccio->ccio_ndisks; ++i)
1329 printf("ccdioctl: component %d: 0x%x\n",
1333 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1335 if (ccddebug & CCDB_INIT)
1336 printf("ccdioctl: lookedup = %d\n", lookedup);
1338 if ((error = ccdlookup(cpp[i], p, &vpp[i])) != 0) {
1339 for (j = 0; j < lookedup; ++j)
1340 (void)vn_close(vpp[j], FREAD|FWRITE,
1342 free(vpp, M_DEVBUF);
1343 free(cpp, M_DEVBUF);
1351 ccd.ccd_ndev = ccio->ccio_ndisks;
1354 * Initialize the ccd. Fills in the softc for us.
1356 if ((error = ccdinit(&ccd, cpp, p)) != 0) {
1357 for (j = 0; j < lookedup; ++j)
1358 (void)vn_close(vpp[j], FREAD|FWRITE,
1360 bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1361 free(vpp, M_DEVBUF);
1362 free(cpp, M_DEVBUF);
1368 * The ccd has been successfully initialized, so
1369 * we can place it into the array and read the disklabel.
1371 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1372 ccio->ccio_unit = unit;
1373 ccio->ccio_size = cs->sc_size;
1374 ccdgetdisklabel(dev);
1381 if ((cs->sc_flags & CCDF_INITED) == 0)
1384 if ((flag & FWRITE) == 0)
1387 if ((error = ccdlock(cs)) != 0)
1390 /* Don't unconfigure if any other partitions are open */
1391 part = ccdpart(dev);
1392 pmask = (1 << part);
1393 if ((cs->sc_openmask & ~pmask)) {
1399 * Free ccd_softc information and clear entry.
1402 /* Close the components and free their pathnames. */
1403 for (i = 0; i < cs->sc_nccdisks; ++i) {
1405 * XXX: this close could potentially fail and
1406 * cause Bad Things. Maybe we need to force
1407 * the close to happen?
1410 if (ccddebug & CCDB_VNODE)
1411 vprint("CCDIOCCLR: vnode info",
1412 cs->sc_cinfo[i].ci_vp);
1414 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1416 free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1419 /* Free interleave index. */
1420 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1421 free(cs->sc_itable[i].ii_index, M_DEVBUF);
1423 /* Free component info and interleave table. */
1424 free(cs->sc_cinfo, M_DEVBUF);
1425 free(cs->sc_itable, M_DEVBUF);
1426 cs->sc_flags &= ~CCDF_INITED;
1429 * Free ccddevice information and clear entry.
1431 free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1432 free(ccddevs[unit].ccd_vpp, M_DEVBUF);
1434 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1437 * And remove the devstat entry.
1439 devstat_remove_entry(&cs->device_stats);
1441 /* This must be atomic. */
1444 bzero(cs, sizeof(struct ccd_softc));
1450 if ((cs->sc_flags & CCDF_INITED) == 0)
1453 *(struct disklabel *)data = cs->sc_label;
1457 if ((cs->sc_flags & CCDF_INITED) == 0)
1460 ((struct partinfo *)data)->disklab = &cs->sc_label;
1461 ((struct partinfo *)data)->part =
1462 &cs->sc_label.d_partitions[ccdpart(dev)];
1467 if ((cs->sc_flags & CCDF_INITED) == 0)
1470 if ((flag & FWRITE) == 0)
1473 if ((error = ccdlock(cs)) != 0)
1476 cs->sc_flags |= CCDF_LABELLING;
1478 error = setdisklabel(&cs->sc_label,
1479 (struct disklabel *)data, 0);
1481 if (cmd == DIOCWDINFO)
1482 error = writedisklabel(CCDLABELDEV(dev),
1486 cs->sc_flags &= ~CCDF_LABELLING;
1495 if ((cs->sc_flags & CCDF_INITED) == 0)
1498 if ((flag & FWRITE) == 0)
1500 if (*(int *)data != 0)
1501 cs->sc_flags |= CCDF_WLABEL;
1503 cs->sc_flags &= ~CCDF_WLABEL;
/*
 * Size query for the partition selected by `dev`: the device is opened,
 * the partition's p_size is read from the in-core label, and the device
 * is closed again.
 * NOTE(review): the function header and the statement following each
 * `if` are elided in this listing; comments cover visible lines only.
 */
1517 struct ccd_softc *cs;
/* Open as a character device on behalf of curproc. */
1520 if (ccdopen(dev, 0, S_IFCHR, curproc))
1523 cs = &ccd_softc[ccdunit(dev)];
1524 part = ccdpart(dev);
/*
 * NOTE(review): this check runs after the ccdopen() above has already
 * succeeded.  If the elided body returns directly, the matching
 * ccdclose() below is never reached -- confirm against the full source.
 */
1526 if ((cs->sc_flags & CCDF_INITED) == 0)
/* The partition's fstype is compared against FS_SWAP before its size is used. */
1529 if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1532 size = cs->sc_label.d_partitions[part].p_size;
/* Balance the ccdopen() above. */
1534 if (ccdclose(dev, 0, S_IFCHR, curproc))
1545 /* Not implemented. */
1550 * Lookup the provided name in the filesystem. If the file exists,
1551 * is a valid block device, and isn't being used by anyone else,
1552 * set *vpp to the file's vnode.
/*
 * ccdlookup(path, p, vpp): open the component named by `path` for
 * reading and writing and vet the resulting vnode.
 *
 * path - pathname handed to NDINIT with UIO_USERSPACE, i.e. it is
 *        fetched from user space.
 * p    - passed through to NDINIT, VOP_UNLOCK and vn_close;
 *        p->p_ucred is supplied to vn_close.
 * vpp  - result pointer for the vnode.
 *
 * NOTE(review): this listing elides several lines of the function
 * (remaining declarations, returns, branch bodies and labels); the
 * comments below cover only the statements that are visible.
 */
1555 ccdlookup(path, p, vpp)
1558 struct vnode **vpp; /* result */
1560 struct nameidata nd;
1564 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, p);
1565 if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) {
/*
 * Test the debug bits as one mask.  Without the parentheses `&` binds
 * tighter than `|`, so the condition was
 * (ccddebug & CCDB_FOLLOW) | CCDB_INIT, which is non-zero whenever
 * CCDB_INIT is non-zero, regardless of ccddebug.
 */
1567 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1568 printf("ccdlookup: vn_open error = %d\n", error);
/* A use count above one means something else also references the vnode. */
1574 if (vp->v_usecount > 1) {
/* vn_isdisk() stores its reason in `error` when the vnode is not a disk. */
1579 if (!vn_isdisk(vp, &error))
1583 if (ccddebug & CCDB_VNODE)
1584 vprint("ccdlookup: vnode info", vp);
/* First exit sequence: unlock the vnode and free only the pathname buffer. */
1587 VOP_UNLOCK(vp, 0, p);
1588 NDFREE(&nd, NDF_ONLY_PNBUF);
/*
 * Second exit sequence: same unlock/free, then the vnode is closed as
 * well (presumably the failure exit; its label is elided here).
 */
1592 VOP_UNLOCK(vp, 0, p);
1593 NDFREE(&nd, NDF_ONLY_PNBUF);
1594 /* vn_close does vrele() for vp */
1595 (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
1600 * Read the disklabel from the ccd. If one is not present, fake one
/*
 * ccdgetdisklabel(dev): set up cs->sc_label for the unit selected by
 * `dev`.  A label is first synthesized from the softc's size and
 * geometry; readdisklabel() is then called with that label, and if it
 * returns an error string ccdmakedisklabel() adjusts the synthesized
 * label.
 * NOTE(review): some lines of this function are elided in this listing.
 */
1604 ccdgetdisklabel(dev)
1607 int unit = ccdunit(dev);
1608 struct ccd_softc *cs = &ccd_softc[unit];
1610 struct disklabel *lp = &cs->sc_label;
1611 struct ccdgeom *ccg = &cs->sc_geom;
/* Start from an all-zero label. */
1613 bzero(lp, sizeof(*lp));
/* Total size and geometry are copied from the softc. */
1615 lp->d_secperunit = cs->sc_size;
1616 lp->d_secsize = ccg->ccg_secsize;
1617 lp->d_nsectors = ccg->ccg_nsectors;
1618 lp->d_ntracks = ccg->ccg_ntracks;
1619 lp->d_ncylinders = ccg->ccg_ncylinders;
/* Sectors per cylinder is derived from the two fields just set. */
1620 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
/*
 * strncpy is bounded by the full field size, so it writes no terminator
 * if the source fills the field; both literals here are short.
 */
1622 strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1623 lp->d_type = DTYPE_CCD;
1624 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1626 lp->d_interleave = 1;
/* Only the raw partition is described: offset 0, spanning sc_size. */
1629 lp->d_partitions[RAW_PART].p_offset = 0;
1630 lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1631 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1632 lp->d_npartitions = RAW_PART + 1;
1634 lp->d_bbsize = BBSIZE; /* XXX */
1635 lp->d_sbsize = SBSIZE; /* XXX */
/* Magic numbers first, then the checksum over the same label (lp == &cs->sc_label). */
1637 lp->d_magic = DISKMAGIC;
1638 lp->d_magic2 = DISKMAGIC;
1639 lp->d_checksum = dkcksum(&cs->sc_label);
1642 * Call the generic disklabel extraction routine.
/* A non-NULL return is an error message; the label is then patched up. */
1644 errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1645 if (errstring != NULL)
1646 ccdmakedisklabel(cs);
1649 /* It's actually extremely common to have unlabeled ccds. */
/* The message is printed only when the CCDB_LABEL debug bit is set. */
1650 if (ccddebug & CCDB_LABEL)
1651 if (errstring != NULL)
1652 printf("ccd%d: %s\n", unit, errstring);
1657 * Take care of things one might want to take care of in the event
1658 * that a disklabel isn't present.
/*
 * ccdmakedisklabel(cs): adjust the in-core label of `cs`: mark the raw
 * partition FS_BSDFFS and set the pack name to "default label".
 * NOTE(review): some lines of this function are elided in this listing.
 */
1661 ccdmakedisklabel(cs)
1662 struct ccd_softc *cs;
1664 struct disklabel *lp = &cs->sc_label;
1667 * For historical reasons, if there's no disklabel present
1668 * the raw partition must be marked FS_BSDFFS.
1670 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
/* Bounded by the field size; the literal is shorter than typical pack-name fields -- TODO confirm field width. */
1672 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1676 * Wait interruptibly for an exclusive lock.
1679 * Several drivers do this; it should be abstracted and made MP-safe.
/*
 * Acquire the per-unit lock kept in cs->sc_flags (CCDF_LOCKED).
 * NOTE(review): the function header, the body of the tsleep() error
 * branch and the final return are elided in this listing.
 */
1683 struct ccd_softc *cs;
/* While another holder has the lock, record that a waiter exists and sleep on `cs`. */
1687 while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1688 cs->sc_flags |= CCDF_WANTED;
/* PCATCH is passed and the timeout is 0; a non-zero tsleep() result is captured in `error`. */
1689 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
/* Lock is free: take it. */
1692 cs->sc_flags |= CCDF_LOCKED;
1697 * Unlock and wake up any waiters.
/*
 * Release the per-unit lock kept in cs->sc_flags.
 * NOTE(review): the function header and the statement that follows the
 * CCDF_WANTED clear are elided in this listing.
 */
1701 struct ccd_softc *cs;
1704 cs->sc_flags &= ~CCDF_LOCKED;
/* If a waiter flagged itself, clear the flag (the wake-up call itself is presumably on the elided next line). */
1705 if ((cs->sc_flags & CCDF_WANTED) != 0) {
1706 cs->sc_flags &= ~CCDF_WANTED;
/*
 * Debug printer for an array of struct ccdiinfo entries.
 * NOTE(review): the function header and some lines are elided in this
 * listing.
 */
1714 struct ccdiinfo *ii;
/* Walk entries until one with ii_ndisk == 0 is reached. */
1718 for (ix = 0; ii->ii_ndisk; ix++, ii++) {
/* Per entry: its position, disk count, start block and start offset ... */
1719 printf(" itab[%d]: #dk %d sblk %d soff %d",
1720 ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
/* ... followed by each of its ii_ndisk index values. */
1721 for (i = 0; i < ii->ii_ndisk; i++)
1722 printf(" %d", ii->ii_index[i]);
1729 /* Local Variables: */
1730 /* c-argdecl-indent: 8 */
1731 /* c-continued-statement-offset: 8 */
1732 /* c-indent-level: 8 */