1 /* $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $ */
2 /* $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.2 2003/06/17 04:28:23 dillon Exp $ */
4 /* $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ */
7 * Copyright (c) 1995 Jason R. Thorpe.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed for the NetBSD Project
22 * 4. The name of the author may not be used to endorse or promote products
23 * derived from this software without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
26 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
27 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
28 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
29 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
30 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * Copyright (c) 1988 University of Utah.
40 * Copyright (c) 1990, 1993
41 * The Regents of the University of California. All rights reserved.
43 * This code is derived from software contributed to Berkeley by
44 * the Systems Programming Group of the University of Utah Computer
47 * Redistribution and use in source and binary forms, with or without
48 * modification, are permitted provided that the following conditions
50 * 1. Redistributions of source code must retain the above copyright
51 * notice, this list of conditions and the following disclaimer.
52 * 2. Redistributions in binary form must reproduce the above copyright
53 * notice, this list of conditions and the following disclaimer in the
54 * documentation and/or other materials provided with the distribution.
55 * 3. All advertising materials mentioning features or use of this software
56 * must display the following acknowledgement:
57 * This product includes software developed by the University of
58 * California, Berkeley and its contributors.
59 * 4. Neither the name of the University nor the names of its contributors
60 * may be used to endorse or promote products derived from this software
61 * without specific prior written permission.
63 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
75 * from: Utah $Hdr: cd.c 1.6 90/11/28$
77 * @(#)cd.c 8.2 (Berkeley) 11/16/93
81 * "Concatenated" disk driver.
83 * Dynamic configuration and disklabel support by:
84 * Jason R. Thorpe <thorpej@nas.nasa.gov>
85 * Numerical Aerodynamic Simulation Facility
87 * NASA Ames Research Center
88 * Moffett Field, CA 94035
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/kernel.h>
96 #include <sys/module.h>
99 #include <sys/malloc.h>
100 #include <sys/namei.h>
101 #include <sys/conf.h>
102 #include <sys/stat.h>
103 #include <sys/sysctl.h>
104 #include <sys/disklabel.h>
105 #include <ufs/ffs/fs.h>
106 #include <sys/devicestat.h>
107 #include <sys/fcntl.h>
108 #include <sys/vnode.h>
110 #include <sys/ccdvar.h>
112 #include <vm/vm_zone.h>
114 #if defined(CCDDEBUG) && !defined(DEBUG)
119 #define CCDB_FOLLOW 0x01
120 #define CCDB_INIT 0x02
122 #define CCDB_LABEL 0x08
123 #define CCDB_VNODE 0x10
124 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
126 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
130 #define ccdunit(x) dkunit(x)
131 #define ccdpart(x) dkpart(x)
134 This is how mirroring works (only writes are special):
136 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
137 linked together by the cb_mirror field. "cb_pflags &
138 CCDPF_MIRROR_DONE" is set to 0 on both of them.
140 When a component returns to ccdiodone(), it checks if "cb_pflags &
141 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
142 flag and returns. If it is, it means its partner has already
143 returned, so it will go to the regular cleanup.
148 struct buf cb_buf; /* new I/O buf */
149 struct buf *cb_obp; /* ptr. to original I/O buf */
150 struct ccdbuf *cb_freenext; /* free list link */
151 int cb_unit; /* target unit */
152 int cb_comp; /* target component */
153 int cb_pflags; /* mirror/parity status flag */
154 struct ccdbuf *cb_mirror; /* mirror counterpart */
157 /* bits in cb_pflags */
158 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
160 #define CCDLABELDEV(dev) \
161 (makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
163 static d_open_t ccdopen;
164 static d_close_t ccdclose;
165 static d_strategy_t ccdstrategy;
166 static d_ioctl_t ccdioctl;
167 static d_dump_t ccddump;
168 static d_psize_t ccdsize;
170 #define NCCDFREEHIWAT 16
172 #define CDEV_MAJOR 74
173 #define BDEV_MAJOR 21
175 static struct cdevsw ccd_cdevsw = {
177 /* close */ ccdclose,
179 /* write */ physwrite,
180 /* ioctl */ ccdioctl,
183 /* strategy */ ccdstrategy,
185 /* maj */ CDEV_MAJOR,
189 /* bmaj */ BDEV_MAJOR
192 /* called during module initialization */
193 static void ccdattach __P((void));
194 static int ccd_modevent __P((module_t, int, void *));
196 /* called by biodone() at interrupt time */
197 static void ccdiodone __P((struct ccdbuf *cbp));
199 static void ccdstart __P((struct ccd_softc *, struct buf *));
200 static void ccdinterleave __P((struct ccd_softc *, int));
201 static void ccdintr __P((struct ccd_softc *, struct buf *));
202 static int ccdinit __P((struct ccddevice *, char **, struct proc *));
203 static int ccdlookup __P((char *, struct proc *p, struct vnode **));
204 static void ccdbuffer __P((struct ccdbuf **ret, struct ccd_softc *,
205 struct buf *, daddr_t, caddr_t, long));
206 static void ccdgetdisklabel __P((dev_t));
207 static void ccdmakedisklabel __P((struct ccd_softc *));
208 static int ccdlock __P((struct ccd_softc *));
209 static void ccdunlock __P((struct ccd_softc *));
212 static void printiinfo __P((struct ccdiinfo *));
215 /* Non-private for the benefit of libkvm. */
216 struct ccd_softc *ccd_softc;
217 struct ccddevice *ccddevs;
218 struct ccdbuf *ccdfreebufs;
219 static int numccdfreebufs;
220 static int numccd = 0;
223 * getccdbuf() - Allocate and zero a ccd buffer.
225 * This routine is called at splbio().
230 getccdbuf(struct ccdbuf *cpy)
235 * Allocate from freelist or malloc as necessary
237 if ((cbp = ccdfreebufs) != NULL) {
238 ccdfreebufs = cbp->cb_freenext;
241 cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
245 * Used by mirroring code
248 bcopy(cpy, cbp, sizeof(struct ccdbuf));
250 bzero(cbp, sizeof(struct ccdbuf));
253 * independent struct buf initialization
255 LIST_INIT(&cbp->cb_buf.b_dep);
256 BUF_LOCKINIT(&cbp->cb_buf);
257 BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
258 BUF_KERNPROC(&cbp->cb_buf);
264 * putccdbuf() - Free a ccd buffer.
266 * This routine is called at splbio().
271 putccdbuf(struct ccdbuf *cbp)
273 BUF_UNLOCK(&cbp->cb_buf);
274 BUF_LOCKFREE(&cbp->cb_buf);
276 if (numccdfreebufs < NCCDFREEHIWAT) {
277 cbp->cb_freenext = ccdfreebufs;
281 free((caddr_t)cbp, M_DEVBUF);
287 * Number of blocks to leave untouched in front of a component partition.
288 * This is to avoid violating its disklabel area when it starts at the
289 * beginning of the slice.
291 #if !defined(CCD_OFFSET)
292 #define CCD_OFFSET 16
296 * Called by main() during pseudo-device attachment. All we need
297 * to do is allocate enough space for devices to be configured later, and
307 printf("ccd0-%d: Concatenated disk drivers\n", num-1);
309 printf("ccd0: Concatenated disk driver\n");
311 ccd_softc = (struct ccd_softc *)malloc(num * sizeof(struct ccd_softc),
313 ccddevs = (struct ccddevice *)malloc(num * sizeof(struct ccddevice),
315 if ((ccd_softc == NULL) || (ccddevs == NULL)) {
316 printf("WARNING: no memory for concatenated disks\n");
317 if (ccd_softc != NULL)
318 free(ccd_softc, M_DEVBUF);
320 free(ccddevs, M_DEVBUF);
324 bzero(ccd_softc, num * sizeof(struct ccd_softc));
325 bzero(ccddevs, num * sizeof(struct ccddevice));
327 cdevsw_add(&ccd_cdevsw);
328 /* XXX: is this necessary? */
329 for (i = 0; i < numccd; ++i)
330 ccddevs[i].ccd_dk = -1;
334 ccd_modevent(mod, type, data)
347 printf("ccd0: Unload not supported!\n");
351 default: /* MOD_SHUTDOWN etc */
357 DEV_MODULE(ccd, ccd_modevent, NULL);
360 ccdinit(ccd, cpaths, p)
361 struct ccddevice *ccd;
365 struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
366 struct ccdcinfo *ci = NULL; /* XXX */
372 struct partinfo dpart;
373 struct ccdgeom *ccg = &cs->sc_geom;
374 char tmppath[MAXPATHLEN];
378 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
379 printf("ccdinit: unit %d\n", ccd->ccd_unit);
383 cs->sc_ileave = ccd->ccd_interleave;
384 cs->sc_nccdisks = ccd->ccd_ndev;
386 /* Allocate space for the component info. */
387 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
391 * Verify that each component piece exists and record
392 * relevant information about it.
396 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
397 vp = ccd->ccd_vpp[ix];
398 ci = &cs->sc_cinfo[ix];
402 * Copy in the pathname of the component.
404 bzero(tmppath, sizeof(tmppath)); /* sanity */
405 if ((error = copyinstr(cpaths[ix], tmppath,
406 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
408 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
409 printf("ccd%d: can't copy path, error = %d\n",
410 ccd->ccd_unit, error);
414 ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
415 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
417 ci->ci_dev = vn_todev(vp);
420 * Get partition information for the component.
422 if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
423 FREAD, p->p_ucred, p)) != 0) {
425 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
426 printf("ccd%d: %s: ioctl failed, error = %d\n",
427 ccd->ccd_unit, ci->ci_path, error);
431 if (dpart.part->p_fstype == FS_BSDFFS) {
433 ((dpart.disklab->d_secsize > maxsecsize) ?
434 dpart.disklab->d_secsize : maxsecsize);
435 size = dpart.part->p_size - CCD_OFFSET;
438 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
439 printf("ccd%d: %s: incorrect partition type\n",
440 ccd->ccd_unit, ci->ci_path);
447 * Calculate the size, truncating to an interleave
448 * boundary if necessary.
451 if (cs->sc_ileave > 1)
452 size -= size % cs->sc_ileave;
456 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
457 printf("ccd%d: %s: size == 0\n",
458 ccd->ccd_unit, ci->ci_path);
464 if (minsize == 0 || size < minsize)
471 * Don't allow the interleave to be smaller than
472 * the biggest component sector.
474 if ((cs->sc_ileave > 0) &&
475 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
477 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
478 printf("ccd%d: interleave must be at least %d\n",
479 ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
486 * If uniform interleave is desired set all sizes to that of
487 * the smallest component. This will guarantee that a single
488 * interleave table is generated.
490 * Lost space must be taken into account when calculating the
491 * overall size. Half the space is lost when CCDF_MIRROR is
492 * specified. One disk is lost when CCDF_PARITY is specified.
494 if (ccd->ccd_flags & CCDF_UNIFORM) {
495 for (ci = cs->sc_cinfo;
496 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
497 ci->ci_size = minsize;
499 if (ccd->ccd_flags & CCDF_MIRROR) {
501 * Check to see if an even number of components
502 * have been specified. The interleave must also
503 * be non-zero in order for us to be able to
504 * guarantee the topology.
506 if (cs->sc_nccdisks % 2) {
507 printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
511 if (cs->sc_ileave == 0) {
512 printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
516 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
517 } else if (ccd->ccd_flags & CCDF_PARITY) {
518 cs->sc_size = (cs->sc_nccdisks-1) * minsize;
520 if (cs->sc_ileave == 0) {
521 printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
525 cs->sc_size = cs->sc_nccdisks * minsize;
530 * Construct the interleave table.
532 ccdinterleave(cs, ccd->ccd_unit);
535 * Create pseudo-geometry based on 1MB cylinders. It's
538 ccg->ccg_secsize = maxsecsize;
539 ccg->ccg_ntracks = 1;
540 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
541 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
544 * Add a devstat entry for this device.
546 devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
547 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
548 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
549 DEVSTAT_PRIORITY_ARRAY);
551 cs->sc_flags |= CCDF_INITED;
552 cs->sc_cflags = ccd->ccd_flags; /* So we can find out later... */
553 cs->sc_unit = ccd->ccd_unit;
556 while (ci > cs->sc_cinfo) {
558 free(ci->ci_path, M_DEVBUF);
560 free(cs->sc_cinfo, M_DEVBUF);
565 ccdinterleave(cs, unit)
566 struct ccd_softc *cs;
569 struct ccdcinfo *ci, *smallci;
576 if (ccddebug & CCDB_INIT)
577 printf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
581 * Allocate an interleave table. The worst case occurs when each
582 * of N disks is of a different size, resulting in N interleave
585 * Chances are this is too big, but we don't care.
587 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
588 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF, M_WAITOK);
589 bzero((caddr_t)cs->sc_itable, size);
592 * Trivial case: no interleave (actually interleave of disk size).
593 * Each table entry represents a single component in its entirety.
595 * An interleave of 0 may not be used with a mirror or parity setup.
597 if (cs->sc_ileave == 0) {
601 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
602 /* Allocate space for ii_index. */
603 ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
605 ii->ii_startblk = bn;
607 ii->ii_index[0] = ix;
608 bn += cs->sc_cinfo[ix].ci_size;
613 if (ccddebug & CCDB_INIT)
614 printiinfo(cs->sc_itable);
620 * The following isn't fast or pretty; it doesn't have to be.
624 for (ii = cs->sc_itable; ; ii++) {
626 * Allocate space for ii_index. We might allocate more than
629 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
633 * Locate the smallest of the remaining components
636 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
638 if (ci->ci_size > size &&
640 ci->ci_size < smallci->ci_size)) {
646 * Nobody left, all done
648 if (smallci == NULL) {
654 * Record starting logical block using an sc_ileave blocksize.
656 ii->ii_startblk = bn / cs->sc_ileave;
659 * Record starting component block using an sc_ileave
660 * blocksize. This value is relative to the beginning of
663 ii->ii_startoff = lbn;
666 * Determine how many disks take part in this interleave
667 * and record their indices.
670 for (ci = cs->sc_cinfo;
671 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
672 if (ci->ci_size >= smallci->ci_size) {
673 ii->ii_index[ix++] = ci - cs->sc_cinfo;
677 bn += ix * (smallci->ci_size - size);
678 lbn = smallci->ci_size / cs->sc_ileave;
679 size = smallci->ci_size;
682 if (ccddebug & CCDB_INIT)
683 printiinfo(cs->sc_itable);
689 ccdopen(dev, flags, fmt, p)
694 int unit = ccdunit(dev);
695 struct ccd_softc *cs;
696 struct disklabel *lp;
697 int error = 0, part, pmask;
700 if (ccddebug & CCDB_FOLLOW)
701 printf("ccdopen(%x, %x)\n", dev, flags);
705 cs = &ccd_softc[unit];
707 if ((error = ccdlock(cs)) != 0)
716 * If we're initialized, check to see if there are any other
717 * open partitions. If not, then it's safe to update
718 * the in-core disklabel.
720 if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
721 ccdgetdisklabel(dev);
723 /* Check that the partition exists. */
724 if (part != RAW_PART && ((part >= lp->d_npartitions) ||
725 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
730 cs->sc_openmask |= pmask;
738 ccdclose(dev, flags, fmt, p)
743 int unit = ccdunit(dev);
744 struct ccd_softc *cs;
748 if (ccddebug & CCDB_FOLLOW)
749 printf("ccdclose(%x, %x)\n", dev, flags);
754 cs = &ccd_softc[unit];
756 if ((error = ccdlock(cs)) != 0)
761 /* ...that much closer to allowing unconfiguration... */
762 cs->sc_openmask &= ~(1 << part);
771 int unit = ccdunit(bp->b_dev);
772 struct ccd_softc *cs = &ccd_softc[unit];
775 struct disklabel *lp;
778 if (ccddebug & CCDB_FOLLOW)
779 printf("ccdstrategy(%x): unit %d\n", bp, unit);
781 if ((cs->sc_flags & CCDF_INITED) == 0) {
783 bp->b_flags |= B_ERROR;
787 /* If it's a nil transfer, wake up the top half now. */
788 if (bp->b_bcount == 0)
794 * Do bounds checking and adjust transfer. If there's an
795 * error, the bounds check will flag that for us.
797 wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
798 if (ccdpart(bp->b_dev) != RAW_PART) {
799 if (bounds_check_with_label(bp, lp, wlabel) <= 0)
802 int pbn; /* in sc_secsize chunks */
803 long sz; /* in sc_secsize chunks */
805 pbn = bp->b_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
806 sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
809 * If out of bounds return an error. If at the EOF point,
810 * simply read or write less.
813 if (pbn < 0 || pbn >= cs->sc_size) {
814 bp->b_resid = bp->b_bcount;
815 if (pbn != cs->sc_size) {
816 bp->b_error = EINVAL;
817 bp->b_flags |= B_ERROR | B_INVAL;
823 * If the request crosses EOF, truncate the request.
825 if (pbn + sz > cs->sc_size) {
826 bp->b_bcount = (cs->sc_size - pbn) *
827 cs->sc_geom.ccg_secsize;
831 bp->b_resid = bp->b_bcount;
846 struct ccd_softc *cs;
850 struct ccdbuf *cbp[4];
851 /* XXX! : 2 reads and 2 writes for RAID 4/5 */
854 struct partition *pp;
857 if (ccddebug & CCDB_FOLLOW)
858 printf("ccdstart(%x, %x)\n", cs, bp);
861 /* Record the transaction start */
862 devstat_start_transaction(&cs->device_stats);
865 * Translate the partition-relative block number to an absolute.
868 if (ccdpart(bp->b_dev) != RAW_PART) {
869 pp = &cs->sc_label.d_partitions[ccdpart(bp->b_dev)];
874 * Allocate component buffers and fire off the requests
877 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
878 ccdbuffer(cbp, cs, bp, bn, addr, bcount);
879 rcount = cbp[0]->cb_buf.b_bcount;
881 if (cs->sc_cflags & CCDF_MIRROR) {
883 * Mirroring. Writes go to both disks, reads are
884 * taken from whichever disk seems most appropriate.
886 * We attempt to localize reads to the disk whose arm
887 * is nearest the read request. We ignore seeks due
888 * to writes when making this determination and we
889 * also try to avoid hogging.
891 if ((cbp[0]->cb_buf.b_flags & B_READ) == 0) {
892 cbp[0]->cb_buf.b_vp->v_numoutput++;
893 cbp[1]->cb_buf.b_vp->v_numoutput++;
894 VOP_STRATEGY(cbp[0]->cb_buf.b_vp,
896 VOP_STRATEGY(cbp[1]->cb_buf.b_vp,
899 int pick = cs->sc_pick;
900 daddr_t range = cs->sc_size / 16;
902 if (bn < cs->sc_blk[pick] - range ||
903 bn > cs->sc_blk[pick] + range
905 cs->sc_pick = pick = 1 - pick;
907 cs->sc_blk[pick] = bn + btodb(rcount);
908 VOP_STRATEGY(cbp[pick]->cb_buf.b_vp,
915 if ((cbp[0]->cb_buf.b_flags & B_READ) == 0)
916 cbp[0]->cb_buf.b_vp->v_numoutput++;
917 VOP_STRATEGY(cbp[0]->cb_buf.b_vp, &cbp[0]->cb_buf);
925 * Build a component buffer header.
928 ccdbuffer(cb, cs, bp, bn, addr, bcount)
930 struct ccd_softc *cs;
936 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */
942 if (ccddebug & CCDB_IO)
943 printf("ccdbuffer(%x, %x, %d, %x, %d)\n",
944 cs, bp, bn, addr, bcount);
947 * Determine which component bn falls in.
952 if (cs->sc_ileave == 0) {
954 * Serially concatenated and neither a mirror nor a parity
955 * config. This is a special case.
960 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
968 * Calculate cbn, the logical superblock (sc_ileave chunks),
969 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
972 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
973 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
976 * Figure out which interleave table to use.
978 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
979 if (ii->ii_startblk > cbn)
985 * off is the logical superblock relative to the beginning
986 * of this interleave block.
988 off = cbn - ii->ii_startblk;
991 * We must calculate which disk component to use (ccdisk),
992 * and recalculate cbn to be the superblock relative to
993 * the beginning of the component. This is typically done by
994 * adding 'off' and ii->ii_startoff together. However, 'off'
995 * must typically be divided by the number of components in
996 * this interleave array to properly convert it from a
997 * CCD-relative logical superblock number to a
998 * component-relative superblock number.
1000 if (ii->ii_ndisk == 1) {
1002 * When we have just one disk, it can't be a mirror
1003 * or a parity config.
1005 ccdisk = ii->ii_index[0];
1006 cbn = ii->ii_startoff + off;
1008 if (cs->sc_cflags & CCDF_MIRROR) {
1010 * We have forced a uniform mapping, resulting
1011 * in a single interleave array. We double
1012 * up on the first half of the available
1013 * components and our mirror is in the second
1014 * half. This only works with a single
1015 * interleave array because doubling up
1016 * doubles the number of sectors, so there
1017 * cannot be another interleave array because
1018 * the next interleave array's calculations
1021 int ndisk2 = ii->ii_ndisk / 2;
1022 ccdisk = ii->ii_index[off % ndisk2];
1023 cbn = ii->ii_startoff + off / ndisk2;
1024 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1025 } else if (cs->sc_cflags & CCDF_PARITY) {
1027 * XXX not implemented yet
1029 int ndisk2 = ii->ii_ndisk - 1;
1030 ccdisk = ii->ii_index[off % ndisk2];
1031 cbn = ii->ii_startoff + off / ndisk2;
1032 if (cbn % ii->ii_ndisk <= ccdisk)
1035 ccdisk = ii->ii_index[off % ii->ii_ndisk];
1036 cbn = ii->ii_startoff + off / ii->ii_ndisk;
1040 ci = &cs->sc_cinfo[ccdisk];
1043 * Convert cbn from a superblock to a normal block so it
1044 * can be used to calculate (along with cboff) the normal
1045 * block index into this particular disk.
1047 cbn *= cs->sc_ileave;
1051 * Fill in the component buf structure.
1053 cbp = getccdbuf(NULL);
1054 cbp->cb_buf.b_flags = bp->b_flags | B_CALL;
1055 cbp->cb_buf.b_iodone = (void (*)(struct buf *))ccdiodone;
1056 cbp->cb_buf.b_dev = ci->ci_dev; /* XXX */
1057 cbp->cb_buf.b_blkno = cbn + cboff + CCD_OFFSET;
1058 cbp->cb_buf.b_offset = dbtob(cbn + cboff + CCD_OFFSET);
1059 cbp->cb_buf.b_data = addr;
1060 cbp->cb_buf.b_vp = ci->ci_vp;
1061 if (cs->sc_ileave == 0)
1062 cbc = dbtob((off_t)(ci->ci_size - cbn));
1064 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1065 cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1066 cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1069 * context for ccdiodone
1072 cbp->cb_unit = cs - ccd_softc;
1073 cbp->cb_comp = ci - cs->sc_cinfo;
1076 if (ccddebug & CCDB_IO)
1077 printf(" dev %x(u%d): cbp %x bn %d addr %x bcnt %d\n",
1078 ci->ci_dev, ci-cs->sc_cinfo, cbp, cbp->cb_buf.b_blkno,
1079 cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1084 * Note: both I/O's setup when reading from mirror, but only one
1087 if (cs->sc_cflags & CCDF_MIRROR) {
1088 /* mirror, setup second I/O */
1089 cbp = getccdbuf(cb[0]);
1090 cbp->cb_buf.b_dev = ci2->ci_dev;
1091 cbp->cb_buf.b_vp = ci2->ci_vp;
1092 cbp->cb_comp = ci2 - cs->sc_cinfo;
1094 /* link together the ccdbuf's and clear "mirror done" flag */
1095 cb[0]->cb_mirror = cb[1];
1096 cb[1]->cb_mirror = cb[0];
1097 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1098 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1104 struct ccd_softc *cs;
1108 if (ccddebug & CCDB_FOLLOW)
1109 printf("ccdintr(%x, %x)\n", cs, bp);
1112 * Request is done for better or worse, wakeup the top half.
1114 if (bp->b_flags & B_ERROR)
1115 bp->b_resid = bp->b_bcount;
1116 devstat_end_transaction_buf(&cs->device_stats, bp);
1121 * Called at interrupt time.
1122 * Mark the component as done and if all components are done,
1123 * take a ccd interrupt.
1129 struct buf *bp = cbp->cb_obp;
1130 int unit = cbp->cb_unit;
1135 if (ccddebug & CCDB_FOLLOW)
1136 printf("ccdiodone(%x)\n", cbp);
1137 if (ccddebug & CCDB_IO) {
1138 printf("ccdiodone: bp %x bcount %d resid %d\n",
1139 bp, bp->b_bcount, bp->b_resid);
1140 printf(" dev %x(u%d), cbp %x bn %d addr %x bcnt %d\n",
1141 cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1142 cbp->cb_buf.b_blkno, cbp->cb_buf.b_data,
1143 cbp->cb_buf.b_bcount);
1147 * If an error occurred, report it. If this is a mirrored
1148 * configuration and the first of two possible reads, do not
1149 * set the error in the bp yet because the second read may
1153 if (cbp->cb_buf.b_flags & B_ERROR) {
1154 const char *msg = "";
1156 if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1157 (cbp->cb_buf.b_flags & B_READ) &&
1158 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1160 * We will try our read on the other disk down
1161 * below, also reverse the default pick so if we
1162 * are doing a scan we do not keep hitting the
1165 struct ccd_softc *cs = &ccd_softc[unit];
1167 msg = ", trying other disk";
1168 cs->sc_pick = 1 - cs->sc_pick;
1169 cs->sc_blk[cs->sc_pick] = bp->b_blkno;
1171 bp->b_flags |= B_ERROR;
1172 bp->b_error = cbp->cb_buf.b_error ?
1173 cbp->cb_buf.b_error : EIO;
1175 printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1176 unit, bp->b_error, cbp->cb_comp,
1177 (int)cbp->cb_buf.b_blkno, bp->b_blkno, msg);
1181 * Process mirror. If we are writing, I/O has been initiated on both
1182 * buffers and we fall through only after both are finished.
1184 * If we are reading only one I/O is initiated at a time. If an
1185 * error occurs we initiate the second I/O and return, otherwise
1186 * we free the second I/O without initiating it.
1189 if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1190 if ((cbp->cb_buf.b_flags & B_READ) == 0) {
1192 * When writing, handshake with the second buffer
1193 * to determine when both are done. If both are not
1194 * done, return here.
1196 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1197 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1204 * When reading, either dispose of the second buffer
1205 * or initiate I/O on the second buffer if an error
1206 * occurred with this one.
1208 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1209 if (cbp->cb_buf.b_flags & B_ERROR) {
1210 cbp->cb_mirror->cb_pflags |=
1213 cbp->cb_mirror->cb_buf.b_vp,
1214 &cbp->cb_mirror->cb_buf
1220 putccdbuf(cbp->cb_mirror);
1228 * use b_bufsize to determine how big the original request was rather
1229 * than b_bcount, because b_bcount may have been truncated for EOF.
1231 * XXX We check for an error, but we do not test the resid for an
1232 * aligned EOF condition. This may result in character & block
1233 * device access not recognizing EOF properly when read or written
1234 * sequentially, but will not affect filesystems.
1236 count = cbp->cb_buf.b_bufsize;
1240 * If all done, "interrupt".
1242 bp->b_resid -= count;
1243 if (bp->b_resid < 0)
1244 panic("ccdiodone: count");
1245 if (bp->b_resid == 0)
1246 ccdintr(&ccd_softc[unit], bp);
1251 ccdioctl(dev, cmd, data, flag, p)
1258 int unit = ccdunit(dev);
1259 int i, j, lookedup = 0, error = 0;
1261 struct ccd_softc *cs;
1262 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1263 struct ccddevice ccd;
1269 cs = &ccd_softc[unit];
1271 bzero(&ccd, sizeof(ccd));
1275 if (cs->sc_flags & CCDF_INITED)
1278 if ((flag & FWRITE) == 0)
1281 if ((error = ccdlock(cs)) != 0)
1284 if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1287 /* Fill in some important bits. */
1288 ccd.ccd_unit = unit;
1289 ccd.ccd_interleave = ccio->ccio_ileave;
1290 if (ccd.ccd_interleave == 0 &&
1291 ((ccio->ccio_flags & CCDF_MIRROR) ||
1292 (ccio->ccio_flags & CCDF_PARITY))) {
1293 printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1294 ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1296 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1297 (ccio->ccio_flags & CCDF_PARITY)) {
1298 printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1299 ccio->ccio_flags &= ~CCDF_PARITY;
1301 if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1302 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1303 printf("ccd%d: mirror/parity forces uniform flag\n",
1305 ccio->ccio_flags |= CCDF_UNIFORM;
1307 ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1310 * Allocate space for and copy in the array of
1311 * component pathnames and device numbers.
1313 cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1314 M_DEVBUF, M_WAITOK);
1315 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1316 M_DEVBUF, M_WAITOK);
1318 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1319 ccio->ccio_ndisks * sizeof(char **));
1321 free(vpp, M_DEVBUF);
1322 free(cpp, M_DEVBUF);
1328 if (ccddebug & CCDB_INIT)
1329 for (i = 0; i < ccio->ccio_ndisks; ++i)
1330 printf("ccdioctl: component %d: 0x%x\n",
1334 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1336 if (ccddebug & CCDB_INIT)
1337 printf("ccdioctl: lookedup = %d\n", lookedup);
1339 if ((error = ccdlookup(cpp[i], p, &vpp[i])) != 0) {
1340 for (j = 0; j < lookedup; ++j)
1341 (void)vn_close(vpp[j], FREAD|FWRITE,
1343 free(vpp, M_DEVBUF);
1344 free(cpp, M_DEVBUF);
1352 ccd.ccd_ndev = ccio->ccio_ndisks;
1355 * Initialize the ccd. Fills in the softc for us.
1357 if ((error = ccdinit(&ccd, cpp, p)) != 0) {
1358 for (j = 0; j < lookedup; ++j)
1359 (void)vn_close(vpp[j], FREAD|FWRITE,
1361 bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1362 free(vpp, M_DEVBUF);
1363 free(cpp, M_DEVBUF);
1369 * The ccd has been successfully initialized, so
1370 * we can place it into the array and read the disklabel.
1372 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1373 ccio->ccio_unit = unit;
1374 ccio->ccio_size = cs->sc_size;
1375 ccdgetdisklabel(dev);
1382 if ((cs->sc_flags & CCDF_INITED) == 0)
1385 if ((flag & FWRITE) == 0)
1388 if ((error = ccdlock(cs)) != 0)
1391 /* Don't unconfigure if any other partitions are open */
1392 part = ccdpart(dev);
1393 pmask = (1 << part);
1394 if ((cs->sc_openmask & ~pmask)) {
1400 * Free ccd_softc information and clear entry.
1403 /* Close the components and free their pathnames. */
1404 for (i = 0; i < cs->sc_nccdisks; ++i) {
1406 * XXX: this close could potentially fail and
1407 * cause Bad Things. Maybe we need to force
1408 * the close to happen?
1411 if (ccddebug & CCDB_VNODE)
1412 vprint("CCDIOCCLR: vnode info",
1413 cs->sc_cinfo[i].ci_vp);
1415 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1417 free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1420 /* Free interleave index. */
1421 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1422 free(cs->sc_itable[i].ii_index, M_DEVBUF);
1424 /* Free component info and interleave table. */
1425 free(cs->sc_cinfo, M_DEVBUF);
1426 free(cs->sc_itable, M_DEVBUF);
1427 cs->sc_flags &= ~CCDF_INITED;
1430 * Free ccddevice information and clear entry.
1432 free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1433 free(ccddevs[unit].ccd_vpp, M_DEVBUF);
1435 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1438 * And remove the devstat entry.
1440 devstat_remove_entry(&cs->device_stats);
1442 /* This must be atomic. */
1445 bzero(cs, sizeof(struct ccd_softc));
1451 if ((cs->sc_flags & CCDF_INITED) == 0)
1454 *(struct disklabel *)data = cs->sc_label;
1458 if ((cs->sc_flags & CCDF_INITED) == 0)
1461 ((struct partinfo *)data)->disklab = &cs->sc_label;
1462 ((struct partinfo *)data)->part =
1463 &cs->sc_label.d_partitions[ccdpart(dev)];
1468 if ((cs->sc_flags & CCDF_INITED) == 0)
1471 if ((flag & FWRITE) == 0)
1474 if ((error = ccdlock(cs)) != 0)
1477 cs->sc_flags |= CCDF_LABELLING;
1479 error = setdisklabel(&cs->sc_label,
1480 (struct disklabel *)data, 0);
1482 if (cmd == DIOCWDINFO)
1483 error = writedisklabel(CCDLABELDEV(dev),
1487 cs->sc_flags &= ~CCDF_LABELLING;
1496 if ((cs->sc_flags & CCDF_INITED) == 0)
1499 if ((flag & FWRITE) == 0)
1501 if (*(int *)data != 0)
1502 cs->sc_flags |= CCDF_WLABEL;
1504 cs->sc_flags &= ~CCDF_WLABEL;
1518 struct ccd_softc *cs;
1521 if (ccdopen(dev, 0, S_IFCHR, curproc))
1524 cs = &ccd_softc[ccdunit(dev)];
1525 part = ccdpart(dev);
1527 if ((cs->sc_flags & CCDF_INITED) == 0)
1530 if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1533 size = cs->sc_label.d_partitions[part].p_size;
1535 if (ccdclose(dev, 0, S_IFCHR, curproc))
1546 /* Not implemented. */
1551 * Look up the provided name in the filesystem. If the file exists,
1552 * is a valid block device, and isn't being used by anyone else,
1553 * set *vpp to the file's vnode.
/*
 * ccdlookup(path, p, vpp): open the component named by `path` with
 * vn_open() for reading and writing, then vet the resulting vnode.
 *
 *   path - pathname handed to NDINIT() with UIO_USERSPACE, i.e. it is
 *          treated as a user-space address.
 *   p    - process on whose behalf the lookup runs; it also supplies
 *          the credentials (p->p_ucred) passed to vn_close().
 *   vpp  - result slot for the vnode.
 */
1556 ccdlookup(path, p, vpp)
1559 struct vnode **vpp; /* result */
1561 struct nameidata nd;
1565 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, p);
1566 if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) {
/*
 * Debug trace. The mask is parenthesized because `&` binds tighter
 * than `|`: without the parentheses the test reads
 * (ccddebug & CCDB_FOLLOW) | CCDB_INIT, which no longer depends on
 * ccddebug whenever CCDB_INIT is non-zero.
 */
1568 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1569 printf("ccdlookup: vn_open error = %d\n", error);
/* A use count above one means somebody else already holds this vnode. */
1575 if (vp->v_usecount > 1) {
/*
 * Only disk vnodes are acceptable. vn_isdisk() is passed &error,
 * presumably so it can report why vp was rejected -- confirm.
 */
1580 if (!vn_isdisk(vp, &error))
1584 if (ccddebug & CCDB_VNODE)
1585 vprint("ccdlookup: vnode info", vp);
/* Unlock the vnode and free only the pathname buffer (NDF_ONLY_PNBUF). */
1588 VOP_UNLOCK(vp, 0, p);
1589 NDFREE(&nd, NDF_ONLY_PNBUF);
/* Same release sequence, this time followed by closing the vnode. */
1593 VOP_UNLOCK(vp, 0, p);
1594 NDFREE(&nd, NDF_ONLY_PNBUF);
1595 /* vn_close does vrele() for vp */
1596 (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
1601 * Read the disklabel from the ccd. If one is not present, fake one
/*
 * ccdgetdisklabel: fill in the in-core disklabel (cs->sc_label) for the
 * ccd unit that `dev` maps to.
 *
 * A default label is first synthesized from the softc: total size from
 * sc_size, geometry from sc_geom, and a single raw partition covering
 * the whole device. readdisklabel() is then called on the label device;
 * when it returns a non-NULL error string, ccdmakedisklabel() is applied
 * to the label.
 */
1605 ccdgetdisklabel(dev)
1608 int unit = ccdunit(dev);
1609 struct ccd_softc *cs = &ccd_softc[unit];
1611 struct disklabel *lp = &cs->sc_label;
1612 struct ccdgeom *ccg = &cs->sc_geom;
/* Start from a zeroed label so every field not set below is 0. */
1614 bzero(lp, sizeof(*lp));
/* Size and geometry come straight from the softc. */
1616 lp->d_secperunit = cs->sc_size;
1617 lp->d_secsize = ccg->ccg_secsize;
1618 lp->d_nsectors = ccg->ccg_nsectors;
1619 lp->d_ntracks = ccg->ccg_ntracks;
1620 lp->d_ncylinders = ccg->ccg_ncylinders;
/* Sectors per cylinder is derived rather than copied. */
1621 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
/* Identify the device type and give the label a placeholder pack name. */
1623 strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1624 lp->d_type = DTYPE_CCD;
1625 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1627 lp->d_interleave = 1;
/*
 * Only the raw partition is described: offset 0, the full device size,
 * and no filesystem type. d_npartitions makes RAW_PART the last slot.
 */
1630 lp->d_partitions[RAW_PART].p_offset = 0;
1631 lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1632 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1633 lp->d_npartitions = RAW_PART + 1;
1635 lp->d_bbsize = BBSIZE; /* XXX */
1636 lp->d_sbsize = SBSIZE; /* XXX */
/*
 * Stamp both magic numbers, then checksum the label (lp and
 * &cs->sc_label refer to the same object).
 */
1638 lp->d_magic = DISKMAGIC;
1639 lp->d_magic2 = DISKMAGIC;
1640 lp->d_checksum = dkcksum(&cs->sc_label);
1643 * Call the generic disklabel extraction routine.
1645 errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
/* A non-NULL string from readdisklabel() is treated as "no usable label". */
1646 if (errstring != NULL)
1647 ccdmakedisklabel(cs);
1650 /* It's actually extremely common to have unlabeled ccds. */
/* With CCDB_LABEL debugging enabled, report the error string for this unit. */
1651 if (ccddebug & CCDB_LABEL)
1652 if (errstring != NULL)
1653 printf("ccd%d: %s\n", unit, errstring);
1658 * Take care of things one might want to take care of in the event
1659 * that a disklabel isn't present.
/*
 * ccdmakedisklabel: adjust the in-core label of `cs` (cs->sc_label):
 * the raw partition's filesystem type becomes FS_BSDFFS and the pack
 * name becomes "default label".
 *
 *   cs - softc whose sc_label is modified in place.
 */
1662 ccdmakedisklabel(cs)
1663 struct ccd_softc *cs;
1665 struct disklabel *lp = &cs->sc_label;
1668 * For historical reasons, if there's no disklabel present
1669 * the raw partition must be marked FS_BSDFFS.
1671 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
/* The copy is bounded by the size of the d_packname field. */
1673 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1677 * Wait interruptibly for an exclusive lock.
1680 * Several drivers do this; it should be abstracted and made MP-safe.
1684 struct ccd_softc *cs;
1688 while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1689 cs->sc_flags |= CCDF_WANTED;
1690 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1693 cs->sc_flags |= CCDF_LOCKED;
1698 * Unlock and wake up any waiters.
1702 struct ccd_softc *cs;
1705 cs->sc_flags &= ~CCDF_LOCKED;
1706 if ((cs->sc_flags & CCDF_WANTED) != 0) {
1707 cs->sc_flags &= ~CCDF_WANTED;
1715 struct ccdiinfo *ii;
1719 for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1720 printf(" itab[%d]: #dk %d sblk %d soff %d",
1721 ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
1722 for (i = 0; i < ii->ii_ndisk; i++)
1723 printf(" %d", ii->ii_index[i]);
1730 /* Local Variables: */
1731 /* c-argdecl-indent: 8 */
1732 /* c-continued-statement-offset: 8 */
1733 /* c-indent-level: 8 */