1 /* $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $ */
2 /* $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.21 2005/12/11 01:54:07 swildner Exp $ */
4 /* $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ */
7 * Copyright (c) 1995 Jason R. Thorpe.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed for the NetBSD Project
22 * 4. The name of the author may not be used to endorse or promote products
23 * derived from this software without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
26 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
27 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
28 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
29 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
30 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * Copyright (c) 1988 University of Utah.
40 * Copyright (c) 1990, 1993
41 * The Regents of the University of California. All rights reserved.
43 * This code is derived from software contributed to Berkeley by
44 * the Systems Programming Group of the University of Utah Computer
47 * Redistribution and use in source and binary forms, with or without
48 * modification, are permitted provided that the following conditions
50 * 1. Redistributions of source code must retain the above copyright
51 * notice, this list of conditions and the following disclaimer.
52 * 2. Redistributions in binary form must reproduce the above copyright
53 * notice, this list of conditions and the following disclaimer in the
54 * documentation and/or other materials provided with the distribution.
55 * 3. All advertising materials mentioning features or use of this software
56 * must display the following acknowledgement:
57 * This product includes software developed by the University of
58 * California, Berkeley and its contributors.
59 * 4. Neither the name of the University nor the names of its contributors
60 * may be used to endorse or promote products derived from this software
61 * without specific prior written permission.
63 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
75 * from: Utah $Hdr: cd.c 1.6 90/11/28$
77 * @(#)cd.c 8.2 (Berkeley) 11/16/93
81 * "Concatenated" disk driver.
83 * Dynamic configuration and disklabel support by:
84 * Jason R. Thorpe <thorpej@nas.nasa.gov>
85 * Numerical Aerodynamic Simulation Facility
87 * NASA Ames Research Center
88 * Moffett Field, CA 94035
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/kernel.h>
96 #include <sys/module.h>
99 #include <sys/malloc.h>
100 #include <sys/nlookup.h>
101 #include <sys/conf.h>
102 #include <sys/stat.h>
103 #include <sys/sysctl.h>
104 #include <sys/disklabel.h>
105 #include <vfs/ufs/fs.h>
106 #include <sys/devicestat.h>
107 #include <sys/fcntl.h>
108 #include <sys/vnode.h>
109 #include <sys/buf2.h>
111 #include <sys/ccdvar.h>
113 #include <sys/thread2.h>
115 #include <vm/vm_zone.h>
117 #if defined(CCDDEBUG) && !defined(DEBUG)
122 #define CCDB_FOLLOW 0x01
123 #define CCDB_INIT 0x02
125 #define CCDB_LABEL 0x08
126 #define CCDB_VNODE 0x10
127 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
129 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
133 #define ccdunit(x) dkunit(x)
134 #define ccdpart(x) dkpart(x)
137 This is how mirroring works (only writes are special):
139 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
140 linked together by the cb_mirror field. "cb_pflags &
141 CCDPF_MIRROR_DONE" is set to 0 on both of them.
143 When a component returns to ccdiodone(), it checks if "cb_pflags &
144 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
145 flag and returns. If it is, it means its partner has already
146 returned, so it will go to the regular cleanup.
151 struct buf cb_buf; /* new I/O buf */
152 struct buf *cb_obp; /* ptr. to original I/O buf */
153 struct ccdbuf *cb_freenext; /* free list link */
154 int cb_unit; /* target unit */
155 int cb_comp; /* target component */
156 int cb_pflags; /* mirror/parity status flag */
157 struct ccdbuf *cb_mirror; /* mirror counterpart */
160 /* bits in cb_pflags */
161 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
163 #define CCDLABELDEV(dev) \
164 (make_sub_dev(dev, dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
166 static d_open_t ccdopen;
167 static d_close_t ccdclose;
168 static d_strategy_t ccdstrategy;
169 static d_ioctl_t ccdioctl;
170 static d_dump_t ccddump;
171 static d_psize_t ccdsize;
173 #define NCCDFREEHIWAT 16
175 #define CDEV_MAJOR 74
177 static struct cdevsw ccd_cdevsw = {
179 /* maj */ CDEV_MAJOR,
185 /* close */ ccdclose,
187 /* write */ physwrite,
188 /* ioctl */ ccdioctl,
191 /* strategy */ ccdstrategy,
196 /* called during module initialization */
197 static void ccdattach (void);
198 static int ccd_modevent (module_t, int, void *);
200 /* called by biodone() at interrupt time */
201 static void ccdiodone (struct ccdbuf *cbp);
203 static void ccdstart (struct ccd_softc *, struct buf *);
204 static void ccdinterleave (struct ccd_softc *, int);
205 static void ccdintr (struct ccd_softc *, struct buf *);
206 static int ccdinit (struct ccddevice *, char **, struct thread *);
207 static int ccdlookup (char *, struct thread *td, struct vnode **);
208 static void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
209 struct buf *, daddr_t, caddr_t, long);
210 static void ccdgetdisklabel (dev_t);
211 static void ccdmakedisklabel (struct ccd_softc *);
212 static int ccdlock (struct ccd_softc *);
213 static void ccdunlock (struct ccd_softc *);
216 static void printiinfo (struct ccdiinfo *);
219 /* Non-private for the benefit of libkvm. */
220 struct ccd_softc *ccd_softc;
221 struct ccddevice *ccddevs;
222 struct ccdbuf *ccdfreebufs;
223 static int numccdfreebufs;
224 static int numccd = 0;
227 * getccdbuf() - Allocate and zero a ccd buffer.
229 * This routine is called at splbio().
234 getccdbuf(struct ccdbuf *cpy)
239 * Allocate from freelist or malloc as necessary
241 if ((cbp = ccdfreebufs) != NULL) {
242 ccdfreebufs = cbp->cb_freenext;
245 cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
249 * Used by mirroring code
252 bcopy(cpy, cbp, sizeof(struct ccdbuf));
254 bzero(cbp, sizeof(struct ccdbuf));
257 * independent struct buf initialization
259 LIST_INIT(&cbp->cb_buf.b_dep);
260 BUF_LOCKINIT(&cbp->cb_buf);
261 BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
262 BUF_KERNPROC(&cbp->cb_buf);
268 * putccdbuf() - Free a ccd buffer.
270 * This routine is called at splbio().
275 putccdbuf(struct ccdbuf *cbp)
277 BUF_UNLOCK(&cbp->cb_buf);
278 BUF_LOCKFREE(&cbp->cb_buf);
280 if (numccdfreebufs < NCCDFREEHIWAT) {
281 cbp->cb_freenext = ccdfreebufs;
285 free((caddr_t)cbp, M_DEVBUF);
291 * Number of blocks to leave untouched in front of a component partition.
292 * This is to avoid violating its disklabel area when it starts at the
293 * beginning of the slice.
295 #if !defined(CCD_OFFSET)
296 #define CCD_OFFSET 16
300 * Called by main() during pseudo-device attachment. All we need
301 * to do is allocate enough space for devices to be configured later, and
311 printf("ccd0-%d: Concatenated disk drivers\n", num-1);
313 printf("ccd0: Concatenated disk driver\n");
315 ccd_softc = malloc(num * sizeof(struct ccd_softc), M_DEVBUF,
317 ccddevs = malloc(num * sizeof(struct ccddevice), M_DEVBUF,
321 cdevsw_add(&ccd_cdevsw, 0, 0);
322 /* XXX: is this necessary? */
323 for (i = 0; i < numccd; ++i)
324 ccddevs[i].ccd_dk = -1;
328 ccd_modevent(module_t mod, int type, void *data)
338 printf("ccd0: Unload not supported!\n");
342 default: /* MOD_SHUTDOWN etc */
348 DEV_MODULE(ccd, ccd_modevent, NULL);
351 ccdinit(struct ccddevice *ccd, char **cpaths, struct thread *td)
353 struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
354 struct ccdcinfo *ci = NULL; /* XXX */
360 struct partinfo dpart;
361 struct ccdgeom *ccg = &cs->sc_geom;
362 char tmppath[MAXPATHLEN];
366 KKASSERT(td->td_proc);
367 cred = td->td_proc->p_ucred;
370 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
371 printf("ccdinit: unit %d\n", ccd->ccd_unit);
375 cs->sc_ileave = ccd->ccd_interleave;
376 cs->sc_nccdisks = ccd->ccd_ndev;
378 /* Allocate space for the component info. */
379 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
383 * Verify that each component piece exists and record
384 * relevant information about it.
388 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
389 vp = ccd->ccd_vpp[ix];
390 ci = &cs->sc_cinfo[ix];
394 * Copy in the pathname of the component.
396 bzero(tmppath, sizeof(tmppath)); /* sanity */
397 if ((error = copyinstr(cpaths[ix], tmppath,
398 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
400 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
401 printf("ccd%d: can't copy path, error = %d\n",
402 ccd->ccd_unit, error);
406 ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
407 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
409 ci->ci_dev = vn_todev(vp);
412 * Get partition information for the component.
414 if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
415 FREAD, cred, td)) != 0) {
417 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
418 printf("ccd%d: %s: ioctl failed, error = %d\n",
419 ccd->ccd_unit, ci->ci_path, error);
423 if (dpart.part->p_fstype == FS_BSDFFS) {
425 ((dpart.disklab->d_secsize > maxsecsize) ?
426 dpart.disklab->d_secsize : maxsecsize);
427 size = dpart.part->p_size - CCD_OFFSET;
430 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
431 printf("ccd%d: %s: incorrect partition type\n",
432 ccd->ccd_unit, ci->ci_path);
439 * Calculate the size, truncating to an interleave
440 * boundary if necessary.
443 if (cs->sc_ileave > 1)
444 size -= size % cs->sc_ileave;
448 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
449 printf("ccd%d: %s: size == 0\n",
450 ccd->ccd_unit, ci->ci_path);
456 if (minsize == 0 || size < minsize)
463 * Don't allow the interleave to be smaller than
464 * the biggest component sector.
466 if ((cs->sc_ileave > 0) &&
467 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
469 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
470 printf("ccd%d: interleave must be at least %d\n",
471 ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
478 * If uniform interleave is desired set all sizes to that of
479 * the smallest component. This will guarantee that a single
480 * interleave table is generated.
482 * Lost space must be taken into account when calculating the
483 * overall size. Half the space is lost when CCDF_MIRROR is
484 * specified. One disk is lost when CCDF_PARITY is specified.
486 if (ccd->ccd_flags & CCDF_UNIFORM) {
487 for (ci = cs->sc_cinfo;
488 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
489 ci->ci_size = minsize;
491 if (ccd->ccd_flags & CCDF_MIRROR) {
493 * Check to see if an even number of components
494 * have been specified. The interleave must also
495 * be non-zero in order for us to be able to
496 * guarantee the topology.
498 if (cs->sc_nccdisks % 2) {
499 printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
503 if (cs->sc_ileave == 0) {
504 printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
508 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
509 } else if (ccd->ccd_flags & CCDF_PARITY) {
510 cs->sc_size = (cs->sc_nccdisks-1) * minsize;
512 if (cs->sc_ileave == 0) {
513 printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
517 cs->sc_size = cs->sc_nccdisks * minsize;
522 * Construct the interleave table.
524 ccdinterleave(cs, ccd->ccd_unit);
527 * Create pseudo-geometry based on 1MB cylinders. It's
530 ccg->ccg_secsize = maxsecsize;
531 ccg->ccg_ntracks = 1;
532 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
533 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
536 * Add a devstat entry for this device.
538 devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
539 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
540 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
541 DEVSTAT_PRIORITY_ARRAY);
543 cs->sc_flags |= CCDF_INITED;
544 cs->sc_cflags = ccd->ccd_flags; /* So we can find out later... */
545 cs->sc_unit = ccd->ccd_unit;
548 while (ci > cs->sc_cinfo) {
550 free(ci->ci_path, M_DEVBUF);
552 free(cs->sc_cinfo, M_DEVBUF);
557 ccdinterleave(struct ccd_softc *cs, int unit)
559 struct ccdcinfo *ci, *smallci;
566 if (ccddebug & CCDB_INIT)
567 printf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
571 * Allocate an interleave table. The worst case occurs when each
572 * of N disks is of a different size, resulting in N interleave
575 * Chances are this is too big, but we don't care.
577 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
578 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF, M_WAITOK);
579 bzero((caddr_t)cs->sc_itable, size);
582 * Trivial case: no interleave (actually interleave of disk size).
583 * Each table entry represents a single component in its entirety.
585 * An interleave of 0 may not be used with a mirror or parity setup.
587 if (cs->sc_ileave == 0) {
591 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
592 /* Allocate space for ii_index. */
593 ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
595 ii->ii_startblk = bn;
597 ii->ii_index[0] = ix;
598 bn += cs->sc_cinfo[ix].ci_size;
603 if (ccddebug & CCDB_INIT)
604 printiinfo(cs->sc_itable);
610 * The following isn't fast or pretty; it doesn't have to be.
614 for (ii = cs->sc_itable; ; ii++) {
616 * Allocate space for ii_index. We might allocate more than
619 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
623 * Locate the smallest of the remaining components
626 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
628 if (ci->ci_size > size &&
630 ci->ci_size < smallci->ci_size)) {
636 * Nobody left, all done
638 if (smallci == NULL) {
644 * Record starting logical block using an sc_ileave blocksize.
646 ii->ii_startblk = bn / cs->sc_ileave;
649 * Record starting component block using an sc_ileave
650 * blocksize. This value is relative to the beginning of
653 ii->ii_startoff = lbn;
656 * Determine how many disks take part in this interleave
657 * and record their indices.
660 for (ci = cs->sc_cinfo;
661 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
662 if (ci->ci_size >= smallci->ci_size) {
663 ii->ii_index[ix++] = ci - cs->sc_cinfo;
667 bn += ix * (smallci->ci_size - size);
668 lbn = smallci->ci_size / cs->sc_ileave;
669 size = smallci->ci_size;
672 if (ccddebug & CCDB_INIT)
673 printiinfo(cs->sc_itable);
679 ccdopen(dev_t dev, int flags, int fmt, d_thread_t *td)
681 int unit = ccdunit(dev);
682 struct ccd_softc *cs;
683 struct disklabel *lp;
684 int error = 0, part, pmask;
687 if (ccddebug & CCDB_FOLLOW)
688 printf("ccdopen(%x, %x)\n", dev, flags);
692 cs = &ccd_softc[unit];
694 if ((error = ccdlock(cs)) != 0)
703 * If we're initialized, check to see if there are any other
704 * open partitions. If not, then it's safe to update
705 * the in-core disklabel.
707 if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
708 ccdgetdisklabel(dev);
710 /* Check that the partition exists. */
711 if (part != RAW_PART && ((part >= lp->d_npartitions) ||
712 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
717 cs->sc_openmask |= pmask;
725 ccdclose(dev_t dev, int flags, int fmt, d_thread_t *td)
727 int unit = ccdunit(dev);
728 struct ccd_softc *cs;
732 if (ccddebug & CCDB_FOLLOW)
733 printf("ccdclose(%x, %x)\n", dev, flags);
738 cs = &ccd_softc[unit];
740 if ((error = ccdlock(cs)) != 0)
745 /* ...that much closer to allowing unconfiguration... */
746 cs->sc_openmask &= ~(1 << part);
752 ccdstrategy(struct buf *bp)
754 int unit = ccdunit(bp->b_dev);
755 struct ccd_softc *cs = &ccd_softc[unit];
757 struct disklabel *lp;
760 if (ccddebug & CCDB_FOLLOW)
761 printf("ccdstrategy(%x): unit %d\n", bp, unit);
763 if ((cs->sc_flags & CCDF_INITED) == 0) {
765 bp->b_flags |= B_ERROR;
769 /* If it's a nil transfer, wake up the top half now. */
770 if (bp->b_bcount == 0)
776 * Do bounds checking and adjust transfer. If there's an
777 * error, the bounds check will flag that for us.
779 wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
780 if (ccdpart(bp->b_dev) != RAW_PART) {
781 if (bounds_check_with_label(bp, lp, wlabel) <= 0)
784 int pbn; /* in sc_secsize chunks */
785 long sz; /* in sc_secsize chunks */
787 pbn = bp->b_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
788 sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
791 * If out of bounds return an error. If at the EOF point,
792 * simply read or write less.
795 if (pbn < 0 || pbn >= cs->sc_size) {
796 bp->b_resid = bp->b_bcount;
797 if (pbn != cs->sc_size) {
798 bp->b_error = EINVAL;
799 bp->b_flags |= B_ERROR | B_INVAL;
805 * If the request crosses EOF, truncate the request.
807 if (pbn + sz > cs->sc_size) {
808 bp->b_bcount = (cs->sc_size - pbn) *
809 cs->sc_geom.ccg_secsize;
813 bp->b_resid = bp->b_bcount;
827 ccdstart(struct ccd_softc *cs, struct buf *bp)
830 struct ccdbuf *cbp[4];
831 /* XXX! : 2 reads and 2 writes for RAID 4/5 */
834 struct partition *pp;
837 if (ccddebug & CCDB_FOLLOW)
838 printf("ccdstart(%x, %x)\n", cs, bp);
841 /* Record the transaction start */
842 devstat_start_transaction(&cs->device_stats);
845 * Translate the partition-relative block number to an absolute.
848 if (ccdpart(bp->b_dev) != RAW_PART) {
849 pp = &cs->sc_label.d_partitions[ccdpart(bp->b_dev)];
854 * Allocate component buffers and fire off the requests
857 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
858 ccdbuffer(cbp, cs, bp, bn, addr, bcount);
859 rcount = cbp[0]->cb_buf.b_bcount;
861 if (cs->sc_cflags & CCDF_MIRROR) {
863 * Mirroring. Writes go to both disks, reads are
864 * taken from whichever disk seems most appropriate.
866 * We attempt to localize reads to the disk whose arm
867 * is nearest the read request. We ignore seeks due
868 * to writes when making this determination and we
869 * also try to avoid hogging.
871 if ((cbp[0]->cb_buf.b_flags & B_READ) == 0) {
872 cbp[0]->cb_buf.b_vp->v_numoutput++;
873 cbp[1]->cb_buf.b_vp->v_numoutput++;
874 VOP_STRATEGY(cbp[0]->cb_buf.b_vp,
876 VOP_STRATEGY(cbp[1]->cb_buf.b_vp,
879 int pick = cs->sc_pick;
880 daddr_t range = cs->sc_size / 16;
882 if (bn < cs->sc_blk[pick] - range ||
883 bn > cs->sc_blk[pick] + range
885 cs->sc_pick = pick = 1 - pick;
887 cs->sc_blk[pick] = bn + btodb(rcount);
888 VOP_STRATEGY(cbp[pick]->cb_buf.b_vp,
895 if ((cbp[0]->cb_buf.b_flags & B_READ) == 0)
896 cbp[0]->cb_buf.b_vp->v_numoutput++;
897 VOP_STRATEGY(cbp[0]->cb_buf.b_vp, &cbp[0]->cb_buf);
905 * Build a component buffer header.
908 ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct buf *bp, daddr_t bn,
909 caddr_t addr, long bcount)
911 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */
917 if (ccddebug & CCDB_IO)
918 printf("ccdbuffer(%x, %x, %d, %x, %d)\n",
919 cs, bp, bn, addr, bcount);
922 * Determine which component bn falls in.
927 if (cs->sc_ileave == 0) {
929 * Serially concatenated and neither a mirror nor a parity
930 * config. This is a special case.
935 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
943 * Calculate cbn, the logical superblock (sc_ileave chunks),
944 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
947 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
948 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
951 * Figure out which interleave table to use.
953 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
954 if (ii->ii_startblk > cbn)
960 * off is the logical superblock relative to the beginning
961 * of this interleave block.
963 off = cbn - ii->ii_startblk;
966 * We must calculate which disk component to use (ccdisk),
967 * and recalculate cbn to be the superblock relative to
968 * the beginning of the component. This is typically done by
969 * adding 'off' and ii->ii_startoff together. However, 'off'
970 * must typically be divided by the number of components in
971 * this interleave array to properly convert it from a
972 * CCD-relative logical superblock number to a
973 * component-relative superblock number.
975 if (ii->ii_ndisk == 1) {
977 * When we have just one disk, it can't be a mirror
978 * or a parity config.
980 ccdisk = ii->ii_index[0];
981 cbn = ii->ii_startoff + off;
983 if (cs->sc_cflags & CCDF_MIRROR) {
985 * We have forced a uniform mapping, resulting
986 * in a single interleave array. We double
987 * up on the first half of the available
988 * components and our mirror is in the second
989 * half. This only works with a single
990 * interleave array because doubling up
991 * doubles the number of sectors, so there
992 * cannot be another interleave array because
993 * the next interleave array's calculations
996 int ndisk2 = ii->ii_ndisk / 2;
997 ccdisk = ii->ii_index[off % ndisk2];
998 cbn = ii->ii_startoff + off / ndisk2;
999 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1000 } else if (cs->sc_cflags & CCDF_PARITY) {
1002 * XXX not implemented yet
1004 int ndisk2 = ii->ii_ndisk - 1;
1005 ccdisk = ii->ii_index[off % ndisk2];
1006 cbn = ii->ii_startoff + off / ndisk2;
1007 if (cbn % ii->ii_ndisk <= ccdisk)
1010 ccdisk = ii->ii_index[off % ii->ii_ndisk];
1011 cbn = ii->ii_startoff + off / ii->ii_ndisk;
1015 ci = &cs->sc_cinfo[ccdisk];
1018 * Convert cbn from a superblock to a normal block so it
1019 * can be used to calculate (along with cboff) the normal
1020 * block index into this particular disk.
1022 cbn *= cs->sc_ileave;
1026 * Fill in the component buf structure.
1028 cbp = getccdbuf(NULL);
1029 cbp->cb_buf.b_flags = bp->b_flags;
1030 cbp->cb_buf.b_iodone = (void (*)(struct buf *))ccdiodone;
1031 cbp->cb_buf.b_dev = ci->ci_dev; /* XXX */
1032 cbp->cb_buf.b_blkno = cbn + cboff + CCD_OFFSET;
1033 cbp->cb_buf.b_offset = dbtob(cbn + cboff + CCD_OFFSET);
1034 cbp->cb_buf.b_data = addr;
1035 cbp->cb_buf.b_vp = ci->ci_vp;
1036 if (cs->sc_ileave == 0)
1037 cbc = dbtob((off_t)(ci->ci_size - cbn));
1039 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1040 cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1041 cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1044 * context for ccdiodone
1047 cbp->cb_unit = cs - ccd_softc;
1048 cbp->cb_comp = ci - cs->sc_cinfo;
1051 if (ccddebug & CCDB_IO)
1052 printf(" dev %x(u%d): cbp %x bn %d addr %x bcnt %d\n",
1053 ci->ci_dev, ci-cs->sc_cinfo, cbp, cbp->cb_buf.b_blkno,
1054 cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1059 * Note: both I/O's setup when reading from mirror, but only one
1062 if (cs->sc_cflags & CCDF_MIRROR) {
1063 /* mirror, setup second I/O */
1064 cbp = getccdbuf(cb[0]);
1065 cbp->cb_buf.b_dev = ci2->ci_dev;
1066 cbp->cb_buf.b_vp = ci2->ci_vp;
1067 cbp->cb_comp = ci2 - cs->sc_cinfo;
1069 /* link together the ccdbuf's and clear "mirror done" flag */
1070 cb[0]->cb_mirror = cb[1];
1071 cb[1]->cb_mirror = cb[0];
1072 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1073 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1078 ccdintr(struct ccd_softc *cs, struct buf *bp)
1081 if (ccddebug & CCDB_FOLLOW)
1082 printf("ccdintr(%x, %x)\n", cs, bp);
1085 * Request is done for better or worse, wakeup the top half.
1087 if (bp->b_flags & B_ERROR)
1088 bp->b_resid = bp->b_bcount;
1089 devstat_end_transaction_buf(&cs->device_stats, bp);
1094 * Called at interrupt time.
1095 * Mark the component as done and if all components are done,
1096 * take a ccd interrupt.
1099 ccdiodone(struct ccdbuf *cbp)
1101 struct buf *bp = cbp->cb_obp;
1102 int unit = cbp->cb_unit;
1107 if (ccddebug & CCDB_FOLLOW)
1108 printf("ccdiodone(%x)\n", cbp);
1109 if (ccddebug & CCDB_IO) {
1110 printf("ccdiodone: bp %x bcount %d resid %d\n",
1111 bp, bp->b_bcount, bp->b_resid);
1112 printf(" dev %x(u%d), cbp %x bn %d addr %x bcnt %d\n",
1113 cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1114 cbp->cb_buf.b_blkno, cbp->cb_buf.b_data,
1115 cbp->cb_buf.b_bcount);
1119 * If an error occurred, report it. If this is a mirrored
1120 * configuration and the first of two possible reads, do not
1121 * set the error in the bp yet because the second read may
1125 if (cbp->cb_buf.b_flags & B_ERROR) {
1126 const char *msg = "";
1128 if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1129 (cbp->cb_buf.b_flags & B_READ) &&
1130 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1132 * We will try our read on the other disk down
1133 * below, also reverse the default pick so if we
1134 * are doing a scan we do not keep hitting the
1137 struct ccd_softc *cs = &ccd_softc[unit];
1139 msg = ", trying other disk";
1140 cs->sc_pick = 1 - cs->sc_pick;
1141 cs->sc_blk[cs->sc_pick] = bp->b_blkno;
1143 bp->b_flags |= B_ERROR;
1144 bp->b_error = cbp->cb_buf.b_error ?
1145 cbp->cb_buf.b_error : EIO;
1147 printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1148 unit, bp->b_error, cbp->cb_comp,
1149 (int)cbp->cb_buf.b_blkno, bp->b_blkno, msg);
1153 * Process mirror. If we are writing, I/O has been initiated on both
1154 * buffers and we fall through only after both are finished.
1156 * If we are reading only one I/O is initiated at a time. If an
1157 * error occurs we initiate the second I/O and return, otherwise
1158 * we free the second I/O without initiating it.
1161 if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1162 if ((cbp->cb_buf.b_flags & B_READ) == 0) {
1164 * When writing, handshake with the second buffer
1165 * to determine when both are done. If both are not
1166 * done, return here.
1168 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1169 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1176 * When reading, either dispose of the second buffer
1177 * or initiate I/O on the second buffer if an error
1178 * occurred with this one.
1180 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1181 if (cbp->cb_buf.b_flags & B_ERROR) {
1182 cbp->cb_mirror->cb_pflags |=
1185 cbp->cb_mirror->cb_buf.b_vp,
1186 &cbp->cb_mirror->cb_buf
1192 putccdbuf(cbp->cb_mirror);
1200 * use b_bufsize to determine how big the original request was rather
1201 * than b_bcount, because b_bcount may have been truncated for EOF.
1203 * XXX We check for an error, but we do not test the resid for an
1204 * aligned EOF condition. This may result in character & block
1205 * device access not recognizing EOF properly when read or written
1206 * sequentially, but will not affect filesystems.
1208 count = cbp->cb_buf.b_bufsize;
1212 * If all done, "interrupt".
1214 bp->b_resid -= count;
1215 if (bp->b_resid < 0)
1216 panic("ccdiodone: count");
1217 if (bp->b_resid == 0)
1218 ccdintr(&ccd_softc[unit], bp);
1223 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, d_thread_t *td)
1225 int unit = ccdunit(dev);
1226 int i, j, lookedup = 0, error = 0;
1228 struct ccd_softc *cs;
1229 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1230 struct ccddevice ccd;
1235 KKASSERT(td->td_proc != NULL);
1236 cred = td->td_proc->p_ucred;
1240 cs = &ccd_softc[unit];
1242 bzero(&ccd, sizeof(ccd));
1246 if (cs->sc_flags & CCDF_INITED)
1249 if ((flag & FWRITE) == 0)
1252 if ((error = ccdlock(cs)) != 0)
1255 if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1258 /* Fill in some important bits. */
1259 ccd.ccd_unit = unit;
1260 ccd.ccd_interleave = ccio->ccio_ileave;
1261 if (ccd.ccd_interleave == 0 &&
1262 ((ccio->ccio_flags & CCDF_MIRROR) ||
1263 (ccio->ccio_flags & CCDF_PARITY))) {
1264 printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1265 ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1267 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1268 (ccio->ccio_flags & CCDF_PARITY)) {
1269 printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1270 ccio->ccio_flags &= ~CCDF_PARITY;
1272 if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1273 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1274 printf("ccd%d: mirror/parity forces uniform flag\n",
1276 ccio->ccio_flags |= CCDF_UNIFORM;
1278 ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1281 * Allocate space for and copy in the array of
1282 * component pathnames and device numbers.
1284 cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1285 M_DEVBUF, M_WAITOK);
1286 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1287 M_DEVBUF, M_WAITOK);
1289 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1290 ccio->ccio_ndisks * sizeof(char **));
1292 free(vpp, M_DEVBUF);
1293 free(cpp, M_DEVBUF);
1299 if (ccddebug & CCDB_INIT)
1300 for (i = 0; i < ccio->ccio_ndisks; ++i)
1301 printf("ccdioctl: component %d: 0x%x\n",
1305 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1307 if (ccddebug & CCDB_INIT)
1308 printf("ccdioctl: lookedup = %d\n", lookedup);
1310 if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1311 for (j = 0; j < lookedup; ++j)
1312 (void)vn_close(vpp[j], FREAD|FWRITE, td);
1313 free(vpp, M_DEVBUF);
1314 free(cpp, M_DEVBUF);
1322 ccd.ccd_ndev = ccio->ccio_ndisks;
1325 * Initialize the ccd. Fills in the softc for us.
1327 if ((error = ccdinit(&ccd, cpp, td)) != 0) {
1328 for (j = 0; j < lookedup; ++j)
1329 (void)vn_close(vpp[j], FREAD|FWRITE, td);
1330 bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1331 free(vpp, M_DEVBUF);
1332 free(cpp, M_DEVBUF);
1338 * The ccd has been successfully initialized, so
1339 * we can place it into the array and read the disklabel.
1341 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1342 ccio->ccio_unit = unit;
1343 ccio->ccio_size = cs->sc_size;
1344 ccdgetdisklabel(dev);
1351 if ((cs->sc_flags & CCDF_INITED) == 0)
1354 if ((flag & FWRITE) == 0)
1357 if ((error = ccdlock(cs)) != 0)
1360 /* Don't unconfigure if any other partitions are open */
1361 part = ccdpart(dev);
1362 pmask = (1 << part);
1363 if ((cs->sc_openmask & ~pmask)) {
1369 * Free ccd_softc information and clear entry.
1372 /* Close the components and free their pathnames. */
1373 for (i = 0; i < cs->sc_nccdisks; ++i) {
1375 * XXX: this close could potentially fail and
1376 * cause Bad Things. Maybe we need to force
1377 * the close to happen?
1380 if (ccddebug & CCDB_VNODE)
1381 vprint("CCDIOCCLR: vnode info",
1382 cs->sc_cinfo[i].ci_vp);
1384 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE, td);
1385 free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1388 /* Free interleave index. */
1389 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1390 free(cs->sc_itable[i].ii_index, M_DEVBUF);
1392 /* Free component info and interleave table. */
1393 free(cs->sc_cinfo, M_DEVBUF);
1394 free(cs->sc_itable, M_DEVBUF);
1395 cs->sc_flags &= ~CCDF_INITED;
1398 * Free ccddevice information and clear entry.
1400 free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1401 free(ccddevs[unit].ccd_vpp, M_DEVBUF);
1403 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1406 * And remove the devstat entry.
1408 devstat_remove_entry(&cs->device_stats);
1410 /* This must be atomic. */
1413 bzero(cs, sizeof(struct ccd_softc));
1419 if ((cs->sc_flags & CCDF_INITED) == 0)
1422 *(struct disklabel *)data = cs->sc_label;
1426 if ((cs->sc_flags & CCDF_INITED) == 0)
1429 ((struct partinfo *)data)->disklab = &cs->sc_label;
1430 ((struct partinfo *)data)->part =
1431 &cs->sc_label.d_partitions[ccdpart(dev)];
1436 if ((cs->sc_flags & CCDF_INITED) == 0)
1439 if ((flag & FWRITE) == 0)
1442 if ((error = ccdlock(cs)) != 0)
1445 cs->sc_flags |= CCDF_LABELLING;
1447 error = setdisklabel(&cs->sc_label,
1448 (struct disklabel *)data, 0);
1450 if (cmd == DIOCWDINFO) {
1451 dev_t cdev = CCDLABELDEV(dev);
1452 error = writedisklabel(cdev, &cs->sc_label);
1456 cs->sc_flags &= ~CCDF_LABELLING;
1465 if ((cs->sc_flags & CCDF_INITED) == 0)
1468 if ((flag & FWRITE) == 0)
1470 if (*(int *)data != 0)
1471 cs->sc_flags |= CCDF_WLABEL;
1473 cs->sc_flags &= ~CCDF_WLABEL;
1486 struct ccd_softc *cs;
1489 if (ccdopen(dev, 0, S_IFCHR, curthread))
1492 cs = &ccd_softc[ccdunit(dev)];
1493 part = ccdpart(dev);
1495 if ((cs->sc_flags & CCDF_INITED) == 0)
1498 if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1501 size = cs->sc_label.d_partitions[part].p_size;
1503 if (ccdclose(dev, 0, S_IFCHR, curthread))
1510 ccddump(dev_t dev, u_int count, u_int blkno, u_int secsize)
1512 /* Not implemented. */
1517 * Lookup the provided name in the filesystem. If the file exists,
1518 * is a valid block device, and isn't being used by anyone else,
1519 * set *vpp to the file's vnode.
1522 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1524 struct nlookupdata nd;
1529 KKASSERT(td->td_proc);
1530 cred = td->td_proc->p_ucred;
1533 error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1536 if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1538 if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1539 printf("ccdlookup: vn_open error = %d\n", error);
1545 if (vp->v_usecount > 1) {
1550 if (!vn_isdisk(vp, &error))
1554 if (ccddebug & CCDB_VNODE)
1555 vprint("ccdlookup: vnode info", vp);
1558 VOP_UNLOCK(vp, 0, td);
1559 nd.nl_open_vp = NULL;
1561 *vpp = vp; /* leave ref intact */
1569 * Read the disklabel from the ccd. If one is not present, fake one
1573 ccdgetdisklabel(dev_t dev)
1575 int unit = ccdunit(dev);
1576 struct ccd_softc *cs = &ccd_softc[unit];
1578 struct disklabel *lp = &cs->sc_label;
1579 struct ccdgeom *ccg = &cs->sc_geom;
1582 bzero(lp, sizeof(*lp));
1584 lp->d_secperunit = cs->sc_size;
1585 lp->d_secsize = ccg->ccg_secsize;
1586 lp->d_nsectors = ccg->ccg_nsectors;
1587 lp->d_ntracks = ccg->ccg_ntracks;
1588 lp->d_ncylinders = ccg->ccg_ncylinders;
1589 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1591 strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1592 lp->d_type = DTYPE_CCD;
1593 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1595 lp->d_interleave = 1;
1598 lp->d_partitions[RAW_PART].p_offset = 0;
1599 lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1600 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1601 lp->d_npartitions = RAW_PART + 1;
1603 lp->d_bbsize = BBSIZE; /* XXX */
1604 lp->d_sbsize = SBSIZE; /* XXX */
1606 lp->d_magic = DISKMAGIC;
1607 lp->d_magic2 = DISKMAGIC;
1608 lp->d_checksum = dkcksum(&cs->sc_label);
1611 * Call the generic disklabel extraction routine.
1613 cdev = CCDLABELDEV(dev);
1614 errstring = readdisklabel(cdev, &cs->sc_label);
1615 if (errstring != NULL)
1616 ccdmakedisklabel(cs);
1619 /* It's actually extremely common to have unlabeled ccds. */
1620 if (ccddebug & CCDB_LABEL)
1621 if (errstring != NULL)
1622 printf("ccd%d: %s\n", unit, errstring);
1627 * Take care of things one might want to take care of in the event
1628 * that a disklabel isn't present.
1631 ccdmakedisklabel(struct ccd_softc *cs)
1633 struct disklabel *lp = &cs->sc_label;
1636 * For historical reasons, if there's no disklabel present
1637 * the raw partition must be marked FS_BSDFFS.
1639 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1641 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1645 * Wait interruptibly for an exclusive lock.
1648 * Several drivers do this; it should be abstracted and made MP-safe.
1651 ccdlock(struct ccd_softc *cs)
1655 while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1656 cs->sc_flags |= CCDF_WANTED;
1657 if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
1660 cs->sc_flags |= CCDF_LOCKED;
1665 * Unlock and wake up any waiters.
1668 ccdunlock(struct ccd_softc *cs)
1671 cs->sc_flags &= ~CCDF_LOCKED;
1672 if ((cs->sc_flags & CCDF_WANTED) != 0) {
1673 cs->sc_flags &= ~CCDF_WANTED;
#ifdef DEBUG
/*
 * Debug helper: print one line per interleave table entry, showing the
 * entry's position, disk count, start block, start offset and the
 * component indices it lists.  The walk stops at the first entry whose
 * ii_ndisk is zero.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	int slot = 0;
	int d;

	while (ii->ii_ndisk) {
		printf(" itab[%d]: #dk %d sblk %d soff %d",
		    slot, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
		for (d = 0; d < ii->ii_ndisk; d++)
			printf(" %d", ii->ii_index[d]);
		printf("\n");
		slot++;
		ii++;
	}
}
#endif
1695 /* Local Variables: */
1696 /* c-argdecl-indent: 8 */
1697 /* c-continued-statement-offset: 8 */
1698 /* c-indent-level: 8 */