1 /* $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $ */
2 /* $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.13 2004/03/01 06:33:13 dillon Exp $ */
4 /* $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ */
7 * Copyright (c) 1995 Jason R. Thorpe.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed for the NetBSD Project
22 * 4. The name of the author may not be used to endorse or promote products
23 * derived from this software without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
26 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
27 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
28 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
29 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
30 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * Copyright (c) 1988 University of Utah.
40 * Copyright (c) 1990, 1993
41 * The Regents of the University of California. All rights reserved.
43 * This code is derived from software contributed to Berkeley by
44 * the Systems Programming Group of the University of Utah Computer
47 * Redistribution and use in source and binary forms, with or without
48 * modification, are permitted provided that the following conditions
50 * 1. Redistributions of source code must retain the above copyright
51 * notice, this list of conditions and the following disclaimer.
52 * 2. Redistributions in binary form must reproduce the above copyright
53 * notice, this list of conditions and the following disclaimer in the
54 * documentation and/or other materials provided with the distribution.
55 * 3. All advertising materials mentioning features or use of this software
56 * must display the following acknowledgement:
57 * This product includes software developed by the University of
58 * California, Berkeley and its contributors.
59 * 4. Neither the name of the University nor the names of its contributors
60 * may be used to endorse or promote products derived from this software
61 * without specific prior written permission.
63 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
75 * from: Utah $Hdr: cd.c 1.6 90/11/28$
77 * @(#)cd.c 8.2 (Berkeley) 11/16/93
81 * "Concatenated" disk driver.
83 * Dynamic configuration and disklabel support by:
84 * Jason R. Thorpe <thorpej@nas.nasa.gov>
85 * Numerical Aerodynamic Simulation Facility
87 * NASA Ames Research Center
88 * Moffett Field, CA 94035
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/kernel.h>
96 #include <sys/module.h>
99 #include <sys/malloc.h>
100 #include <sys/namei.h>
101 #include <sys/conf.h>
102 #include <sys/stat.h>
103 #include <sys/sysctl.h>
104 #include <sys/disklabel.h>
105 #include <vfs/ufs/fs.h>
106 #include <sys/devicestat.h>
107 #include <sys/fcntl.h>
108 #include <sys/vnode.h>
109 #include <sys/buf2.h>
111 #include <sys/ccdvar.h>
113 #include <vm/vm_zone.h>
115 #if defined(CCDDEBUG) && !defined(DEBUG)
120 #define CCDB_FOLLOW 0x01
121 #define CCDB_INIT 0x02
123 #define CCDB_LABEL 0x08
124 #define CCDB_VNODE 0x10
125 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
127 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
131 #define ccdunit(x) dkunit(x)
132 #define ccdpart(x) dkpart(x)
135 This is how mirroring works (only writes are special):
137 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
138 linked together by the cb_mirror field. "cb_pflags &
139 CCDPF_MIRROR_DONE" is set to 0 on both of them.
141 When a component returns to ccdiodone(), it checks if "cb_pflags &
142 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
143 flag and returns. If it is, it means its partner has already
144 returned, so it will go to the regular cleanup.
149 struct buf cb_buf; /* new I/O buf */
150 struct buf *cb_obp; /* ptr. to original I/O buf */
151 struct ccdbuf *cb_freenext; /* free list link */
152 int cb_unit; /* target unit */
153 int cb_comp; /* target component */
154 int cb_pflags; /* mirror/parity status flag */
155 struct ccdbuf *cb_mirror; /* mirror counterpart */
158 /* bits in cb_pflags */
159 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
161 #define CCDLABELDEV(dev) \
162 (makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
164 static d_open_t ccdopen;
165 static d_close_t ccdclose;
166 static d_strategy_t ccdstrategy;
167 static d_ioctl_t ccdioctl;
168 static d_dump_t ccddump;
169 static d_psize_t ccdsize;
171 #define NCCDFREEHIWAT 16
173 #define CDEV_MAJOR 74
175 static struct cdevsw ccd_cdevsw = {
177 /* maj */ CDEV_MAJOR,
183 /* close */ ccdclose,
185 /* write */ physwrite,
186 /* ioctl */ ccdioctl,
189 /* strategy */ ccdstrategy,
194 /* called during module initialization */
195 static void ccdattach (void);
196 static int ccd_modevent (module_t, int, void *);
198 /* called by biodone() at interrupt time */
199 static void ccdiodone (struct ccdbuf *cbp);
201 static void ccdstart (struct ccd_softc *, struct buf *);
202 static void ccdinterleave (struct ccd_softc *, int);
203 static void ccdintr (struct ccd_softc *, struct buf *);
204 static int ccdinit (struct ccddevice *, char **, struct thread *);
205 static int ccdlookup (char *, struct thread *td, struct vnode **);
206 static void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
207 struct buf *, daddr_t, caddr_t, long);
208 static void ccdgetdisklabel (dev_t);
209 static void ccdmakedisklabel (struct ccd_softc *);
210 static int ccdlock (struct ccd_softc *);
211 static void ccdunlock (struct ccd_softc *);
214 static void printiinfo (struct ccdiinfo *);
217 /* Non-private for the benefit of libkvm. */
218 struct ccd_softc *ccd_softc;
219 struct ccddevice *ccddevs;
220 struct ccdbuf *ccdfreebufs;
221 static int numccdfreebufs;
222 static int numccd = 0;
225 * getccdbuf() - Allocate and zero a ccd buffer.
227 * This routine is called at splbio().
232 getccdbuf(struct ccdbuf *cpy)
237 * Allocate from freelist or malloc as necessary
239 if ((cbp = ccdfreebufs) != NULL) {
240 ccdfreebufs = cbp->cb_freenext;
243 cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
247 * Used by mirroring code
250 bcopy(cpy, cbp, sizeof(struct ccdbuf));
252 bzero(cbp, sizeof(struct ccdbuf));
255 * independent struct buf initialization
257 LIST_INIT(&cbp->cb_buf.b_dep);
258 BUF_LOCKINIT(&cbp->cb_buf);
259 BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
260 BUF_KERNPROC(&cbp->cb_buf);
266 * putccdbuf() - Free a ccd buffer.
268 * This routine is called at splbio().
273 putccdbuf(struct ccdbuf *cbp)
275 BUF_UNLOCK(&cbp->cb_buf);
276 BUF_LOCKFREE(&cbp->cb_buf);
278 if (numccdfreebufs < NCCDFREEHIWAT) {
279 cbp->cb_freenext = ccdfreebufs;
283 free((caddr_t)cbp, M_DEVBUF);
289 * Number of blocks to leave untouched in front of a component partition.
290 * This is to avoid violating its disklabel area when it starts at the
291 * beginning of the slice.
293 #if !defined(CCD_OFFSET)
294 #define CCD_OFFSET 16
298 * Called by main() during pseudo-device attachment. All we need
299 * to do is allocate enough space for devices to be configured later, and
309 printf("ccd0-%d: Concatenated disk drivers\n", num-1);
311 printf("ccd0: Concatenated disk driver\n");
313 ccd_softc = (struct ccd_softc *)malloc(num * sizeof(struct ccd_softc),
315 ccddevs = (struct ccddevice *)malloc(num * sizeof(struct ccddevice),
317 if ((ccd_softc == NULL) || (ccddevs == NULL)) {
318 printf("WARNING: no memory for concatenated disks\n");
319 if (ccd_softc != NULL)
320 free(ccd_softc, M_DEVBUF);
322 free(ccddevs, M_DEVBUF);
326 bzero(ccd_softc, num * sizeof(struct ccd_softc));
327 bzero(ccddevs, num * sizeof(struct ccddevice));
329 cdevsw_add(&ccd_cdevsw);
330 /* XXX: is this necessary? */
331 for (i = 0; i < numccd; ++i)
332 ccddevs[i].ccd_dk = -1;
336 ccd_modevent(mod, type, data)
349 printf("ccd0: Unload not supported!\n");
353 default: /* MOD_SHUTDOWN etc */
359 DEV_MODULE(ccd, ccd_modevent, NULL);
362 ccdinit(struct ccddevice *ccd, char **cpaths, struct thread *td)
364 struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
365 struct ccdcinfo *ci = NULL; /* XXX */
371 struct partinfo dpart;
372 struct ccdgeom *ccg = &cs->sc_geom;
373 char tmppath[MAXPATHLEN];
377 KKASSERT(td->td_proc);
378 cred = td->td_proc->p_ucred;
381 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
382 printf("ccdinit: unit %d\n", ccd->ccd_unit);
386 cs->sc_ileave = ccd->ccd_interleave;
387 cs->sc_nccdisks = ccd->ccd_ndev;
389 /* Allocate space for the component info. */
390 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
394 * Verify that each component piece exists and record
395 * relevant information about it.
399 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
400 vp = ccd->ccd_vpp[ix];
401 ci = &cs->sc_cinfo[ix];
405 * Copy in the pathname of the component.
407 bzero(tmppath, sizeof(tmppath)); /* sanity */
408 if ((error = copyinstr(cpaths[ix], tmppath,
409 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
411 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
412 printf("ccd%d: can't copy path, error = %d\n",
413 ccd->ccd_unit, error);
417 ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
418 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
420 ci->ci_dev = vn_todev(vp);
423 * Get partition information for the component.
425 if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
426 FREAD, cred, td)) != 0) {
428 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
429 printf("ccd%d: %s: ioctl failed, error = %d\n",
430 ccd->ccd_unit, ci->ci_path, error);
434 if (dpart.part->p_fstype == FS_BSDFFS) {
436 ((dpart.disklab->d_secsize > maxsecsize) ?
437 dpart.disklab->d_secsize : maxsecsize);
438 size = dpart.part->p_size - CCD_OFFSET;
441 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
442 printf("ccd%d: %s: incorrect partition type\n",
443 ccd->ccd_unit, ci->ci_path);
450 * Calculate the size, truncating to an interleave
451 * boundary if necessary.
454 if (cs->sc_ileave > 1)
455 size -= size % cs->sc_ileave;
459 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
460 printf("ccd%d: %s: size == 0\n",
461 ccd->ccd_unit, ci->ci_path);
467 if (minsize == 0 || size < minsize)
474 * Don't allow the interleave to be smaller than
475 * the biggest component sector.
477 if ((cs->sc_ileave > 0) &&
478 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
480 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
481 printf("ccd%d: interleave must be at least %d\n",
482 ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
489 * If uniform interleave is desired set all sizes to that of
490 * the smallest component. This will guarantee that a single
491 * interleave table is generated.
493 * Lost space must be taken into account when calculating the
494 * overall size. Half the space is lost when CCDF_MIRROR is
495 * specified. One disk is lost when CCDF_PARITY is specified.
497 if (ccd->ccd_flags & CCDF_UNIFORM) {
498 for (ci = cs->sc_cinfo;
499 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
500 ci->ci_size = minsize;
502 if (ccd->ccd_flags & CCDF_MIRROR) {
504 * Check to see if an even number of components
505 * have been specified. The interleave must also
506 * be non-zero in order for us to be able to
507 * guarantee the topology.
509 if (cs->sc_nccdisks % 2) {
510 printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
514 if (cs->sc_ileave == 0) {
515 printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
519 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
520 } else if (ccd->ccd_flags & CCDF_PARITY) {
521 cs->sc_size = (cs->sc_nccdisks-1) * minsize;
523 if (cs->sc_ileave == 0) {
524 printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
528 cs->sc_size = cs->sc_nccdisks * minsize;
533 * Construct the interleave table.
535 ccdinterleave(cs, ccd->ccd_unit);
538 * Create pseudo-geometry based on 1MB cylinders. It's
541 ccg->ccg_secsize = maxsecsize;
542 ccg->ccg_ntracks = 1;
543 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
544 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
547 * Add a devstat entry for this device.
549 devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
550 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
551 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
552 DEVSTAT_PRIORITY_ARRAY);
554 cs->sc_flags |= CCDF_INITED;
555 cs->sc_cflags = ccd->ccd_flags; /* So we can find out later... */
556 cs->sc_unit = ccd->ccd_unit;
559 while (ci > cs->sc_cinfo) {
561 free(ci->ci_path, M_DEVBUF);
563 free(cs->sc_cinfo, M_DEVBUF);
568 ccdinterleave(cs, unit)
569 struct ccd_softc *cs;
572 struct ccdcinfo *ci, *smallci;
579 if (ccddebug & CCDB_INIT)
580 printf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
584 * Allocate an interleave table. The worst case occurs when each
585 * of N disks is of a different size, resulting in N interleave
588 * Chances are this is too big, but we don't care.
590 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
591 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF, M_WAITOK);
592 bzero((caddr_t)cs->sc_itable, size);
595 * Trivial case: no interleave (actually interleave of disk size).
596 * Each table entry represents a single component in its entirety.
598 * An interleave of 0 may not be used with a mirror or parity setup.
600 if (cs->sc_ileave == 0) {
604 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
605 /* Allocate space for ii_index. */
606 ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
608 ii->ii_startblk = bn;
610 ii->ii_index[0] = ix;
611 bn += cs->sc_cinfo[ix].ci_size;
616 if (ccddebug & CCDB_INIT)
617 printiinfo(cs->sc_itable);
623 * The following isn't fast or pretty; it doesn't have to be.
627 for (ii = cs->sc_itable; ; ii++) {
629 * Allocate space for ii_index. We might allocate more than
632 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
636 * Locate the smallest of the remaining components
639 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
641 if (ci->ci_size > size &&
643 ci->ci_size < smallci->ci_size)) {
649 * Nobody left, all done
651 if (smallci == NULL) {
657 * Record starting logical block using an sc_ileave blocksize.
659 ii->ii_startblk = bn / cs->sc_ileave;
662 * Record starting component block using an sc_ileave
663 * blocksize. This value is relative to the beginning of
666 ii->ii_startoff = lbn;
669 * Determine how many disks take part in this interleave
670 * and record their indices.
673 for (ci = cs->sc_cinfo;
674 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
675 if (ci->ci_size >= smallci->ci_size) {
676 ii->ii_index[ix++] = ci - cs->sc_cinfo;
680 bn += ix * (smallci->ci_size - size);
681 lbn = smallci->ci_size / cs->sc_ileave;
682 size = smallci->ci_size;
685 if (ccddebug & CCDB_INIT)
686 printiinfo(cs->sc_itable);
692 ccdopen(dev_t dev, int flags, int fmt, d_thread_t *td)
694 int unit = ccdunit(dev);
695 struct ccd_softc *cs;
696 struct disklabel *lp;
697 int error = 0, part, pmask;
700 if (ccddebug & CCDB_FOLLOW)
701 printf("ccdopen(%x, %x)\n", dev, flags);
705 cs = &ccd_softc[unit];
707 if ((error = ccdlock(cs)) != 0)
716 * If we're initialized, check to see if there are any other
717 * open partitions. If not, then it's safe to update
718 * the in-core disklabel.
720 if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
721 ccdgetdisklabel(dev);
723 /* Check that the partition exists. */
724 if (part != RAW_PART && ((part >= lp->d_npartitions) ||
725 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
730 cs->sc_openmask |= pmask;
738 ccdclose(dev_t dev, int flags, int fmt, d_thread_t *td)
740 int unit = ccdunit(dev);
741 struct ccd_softc *cs;
745 if (ccddebug & CCDB_FOLLOW)
746 printf("ccdclose(%x, %x)\n", dev, flags);
751 cs = &ccd_softc[unit];
753 if ((error = ccdlock(cs)) != 0)
758 /* ...that much closer to allowing unconfiguration... */
759 cs->sc_openmask &= ~(1 << part);
768 int unit = ccdunit(bp->b_dev);
769 struct ccd_softc *cs = &ccd_softc[unit];
772 struct disklabel *lp;
775 if (ccddebug & CCDB_FOLLOW)
776 printf("ccdstrategy(%x): unit %d\n", bp, unit);
778 if ((cs->sc_flags & CCDF_INITED) == 0) {
780 bp->b_flags |= B_ERROR;
784 /* If it's a nil transfer, wake up the top half now. */
785 if (bp->b_bcount == 0)
791 * Do bounds checking and adjust transfer. If there's an
792 * error, the bounds check will flag that for us.
794 wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
795 if (ccdpart(bp->b_dev) != RAW_PART) {
796 if (bounds_check_with_label(bp, lp, wlabel) <= 0)
799 int pbn; /* in sc_secsize chunks */
800 long sz; /* in sc_secsize chunks */
802 pbn = bp->b_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
803 sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
806 * If out of bounds return an error. If at the EOF point,
807 * simply read or write less.
810 if (pbn < 0 || pbn >= cs->sc_size) {
811 bp->b_resid = bp->b_bcount;
812 if (pbn != cs->sc_size) {
813 bp->b_error = EINVAL;
814 bp->b_flags |= B_ERROR | B_INVAL;
820 * If the request crosses EOF, truncate the request.
822 if (pbn + sz > cs->sc_size) {
823 bp->b_bcount = (cs->sc_size - pbn) *
824 cs->sc_geom.ccg_secsize;
828 bp->b_resid = bp->b_bcount;
843 struct ccd_softc *cs;
847 struct ccdbuf *cbp[4];
848 /* XXX! : 2 reads and 2 writes for RAID 4/5 */
851 struct partition *pp;
854 if (ccddebug & CCDB_FOLLOW)
855 printf("ccdstart(%x, %x)\n", cs, bp);
858 /* Record the transaction start */
859 devstat_start_transaction(&cs->device_stats);
862 * Translate the partition-relative block number to an absolute.
865 if (ccdpart(bp->b_dev) != RAW_PART) {
866 pp = &cs->sc_label.d_partitions[ccdpart(bp->b_dev)];
871 * Allocate component buffers and fire off the requests
874 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
875 ccdbuffer(cbp, cs, bp, bn, addr, bcount);
876 rcount = cbp[0]->cb_buf.b_bcount;
878 if (cs->sc_cflags & CCDF_MIRROR) {
880 * Mirroring. Writes go to both disks, reads are
881 * taken from whichever disk seems most appropriate.
883 * We attempt to localize reads to the disk whose arm
884 * is nearest the read request. We ignore seeks due
885 * to writes when making this determination and we
886 * also try to avoid hogging.
888 if ((cbp[0]->cb_buf.b_flags & B_READ) == 0) {
889 cbp[0]->cb_buf.b_vp->v_numoutput++;
890 cbp[1]->cb_buf.b_vp->v_numoutput++;
891 VOP_STRATEGY(cbp[0]->cb_buf.b_vp,
893 VOP_STRATEGY(cbp[1]->cb_buf.b_vp,
896 int pick = cs->sc_pick;
897 daddr_t range = cs->sc_size / 16;
899 if (bn < cs->sc_blk[pick] - range ||
900 bn > cs->sc_blk[pick] + range
902 cs->sc_pick = pick = 1 - pick;
904 cs->sc_blk[pick] = bn + btodb(rcount);
905 VOP_STRATEGY(cbp[pick]->cb_buf.b_vp,
912 if ((cbp[0]->cb_buf.b_flags & B_READ) == 0)
913 cbp[0]->cb_buf.b_vp->v_numoutput++;
914 VOP_STRATEGY(cbp[0]->cb_buf.b_vp, &cbp[0]->cb_buf);
922 * Build a component buffer header.
925 ccdbuffer(cb, cs, bp, bn, addr, bcount)
927 struct ccd_softc *cs;
933 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */
939 if (ccddebug & CCDB_IO)
940 printf("ccdbuffer(%x, %x, %d, %x, %d)\n",
941 cs, bp, bn, addr, bcount);
944 * Determine which component bn falls in.
949 if (cs->sc_ileave == 0) {
951 * Serially concatenated and neither a mirror nor a parity
952 * config. This is a special case.
957 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
965 * Calculate cbn, the logical superblock (sc_ileave chunks),
966 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
969 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
970 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
973 * Figure out which interleave table to use.
975 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
976 if (ii->ii_startblk > cbn)
982 * off is the logical superblock relative to the beginning
983 * of this interleave block.
985 off = cbn - ii->ii_startblk;
988 * We must calculate which disk component to use (ccdisk),
989 * and recalculate cbn to be the superblock relative to
990 * the beginning of the component. This is typically done by
991 * adding 'off' and ii->ii_startoff together. However, 'off'
992 * must typically be divided by the number of components in
993 * this interleave array to properly convert it from a
994 * CCD-relative logical superblock number to a
995 * component-relative superblock number.
997 if (ii->ii_ndisk == 1) {
999 * When we have just one disk, it can't be a mirror
1000 * or a parity config.
1002 ccdisk = ii->ii_index[0];
1003 cbn = ii->ii_startoff + off;
1005 if (cs->sc_cflags & CCDF_MIRROR) {
1007 * We have forced a uniform mapping, resulting
1008 * in a single interleave array. We double
1009 * up on the first half of the available
1010 * components and our mirror is in the second
1011 * half. This only works with a single
1012 * interleave array because doubling up
1013 * doubles the number of sectors, so there
1014 * cannot be another interleave array because
1015 * the next interleave array's calculations
1018 int ndisk2 = ii->ii_ndisk / 2;
1019 ccdisk = ii->ii_index[off % ndisk2];
1020 cbn = ii->ii_startoff + off / ndisk2;
1021 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1022 } else if (cs->sc_cflags & CCDF_PARITY) {
1024 * XXX not implemented yet
1026 int ndisk2 = ii->ii_ndisk - 1;
1027 ccdisk = ii->ii_index[off % ndisk2];
1028 cbn = ii->ii_startoff + off / ndisk2;
1029 if (cbn % ii->ii_ndisk <= ccdisk)
1032 ccdisk = ii->ii_index[off % ii->ii_ndisk];
1033 cbn = ii->ii_startoff + off / ii->ii_ndisk;
1037 ci = &cs->sc_cinfo[ccdisk];
1040 * Convert cbn from a superblock to a normal block so it
1041 * can be used to calculate (along with cboff) the normal
1042 * block index into this particular disk.
1044 cbn *= cs->sc_ileave;
1048 * Fill in the component buf structure.
1050 cbp = getccdbuf(NULL);
1051 cbp->cb_buf.b_flags = bp->b_flags | B_CALL;
1052 cbp->cb_buf.b_iodone = (void (*)(struct buf *))ccdiodone;
1053 cbp->cb_buf.b_dev = ci->ci_dev; /* XXX */
1054 cbp->cb_buf.b_blkno = cbn + cboff + CCD_OFFSET;
1055 cbp->cb_buf.b_offset = dbtob(cbn + cboff + CCD_OFFSET);
1056 cbp->cb_buf.b_data = addr;
1057 cbp->cb_buf.b_vp = ci->ci_vp;
1058 if (cs->sc_ileave == 0)
1059 cbc = dbtob((off_t)(ci->ci_size - cbn));
1061 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1062 cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1063 cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1066 * context for ccdiodone
1069 cbp->cb_unit = cs - ccd_softc;
1070 cbp->cb_comp = ci - cs->sc_cinfo;
1073 if (ccddebug & CCDB_IO)
1074 printf(" dev %x(u%d): cbp %x bn %d addr %x bcnt %d\n",
1075 ci->ci_dev, ci-cs->sc_cinfo, cbp, cbp->cb_buf.b_blkno,
1076 cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1081 * Note: both I/O's setup when reading from mirror, but only one
1084 if (cs->sc_cflags & CCDF_MIRROR) {
1085 /* mirror, setup second I/O */
1086 cbp = getccdbuf(cb[0]);
1087 cbp->cb_buf.b_dev = ci2->ci_dev;
1088 cbp->cb_buf.b_vp = ci2->ci_vp;
1089 cbp->cb_comp = ci2 - cs->sc_cinfo;
1091 /* link together the ccdbuf's and clear "mirror done" flag */
1092 cb[0]->cb_mirror = cb[1];
1093 cb[1]->cb_mirror = cb[0];
1094 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1095 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1101 struct ccd_softc *cs;
1105 if (ccddebug & CCDB_FOLLOW)
1106 printf("ccdintr(%x, %x)\n", cs, bp);
1109 * Request is done for better or worse, wakeup the top half.
1111 if (bp->b_flags & B_ERROR)
1112 bp->b_resid = bp->b_bcount;
1113 devstat_end_transaction_buf(&cs->device_stats, bp);
1118 * Called at interrupt time.
1119 * Mark the component as done and if all components are done,
1120 * take a ccd interrupt.
1126 struct buf *bp = cbp->cb_obp;
1127 int unit = cbp->cb_unit;
1132 if (ccddebug & CCDB_FOLLOW)
1133 printf("ccdiodone(%x)\n", cbp);
1134 if (ccddebug & CCDB_IO) {
1135 printf("ccdiodone: bp %x bcount %d resid %d\n",
1136 bp, bp->b_bcount, bp->b_resid);
1137 printf(" dev %x(u%d), cbp %x bn %d addr %x bcnt %d\n",
1138 cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1139 cbp->cb_buf.b_blkno, cbp->cb_buf.b_data,
1140 cbp->cb_buf.b_bcount);
1144 * If an error occurred, report it. If this is a mirrored
1145 * configuration and the first of two possible reads, do not
1146 * set the error in the bp yet because the second read may
1150 if (cbp->cb_buf.b_flags & B_ERROR) {
1151 const char *msg = "";
1153 if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1154 (cbp->cb_buf.b_flags & B_READ) &&
1155 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1157 * We will try our read on the other disk down
1158 * below, also reverse the default pick so if we
1159 * are doing a scan we do not keep hitting the
1162 struct ccd_softc *cs = &ccd_softc[unit];
1164 msg = ", trying other disk";
1165 cs->sc_pick = 1 - cs->sc_pick;
1166 cs->sc_blk[cs->sc_pick] = bp->b_blkno;
1168 bp->b_flags |= B_ERROR;
1169 bp->b_error = cbp->cb_buf.b_error ?
1170 cbp->cb_buf.b_error : EIO;
1172 printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1173 unit, bp->b_error, cbp->cb_comp,
1174 (int)cbp->cb_buf.b_blkno, bp->b_blkno, msg);
1178 * Process mirror. If we are writing, I/O has been initiated on both
1179 * buffers and we fall through only after both are finished.
1181 * If we are reading only one I/O is initiated at a time. If an
1182 * error occurs we initiate the second I/O and return, otherwise
1183 * we free the second I/O without initiating it.
1186 if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1187 if ((cbp->cb_buf.b_flags & B_READ) == 0) {
1189 * When writing, handshake with the second buffer
1190 * to determine when both are done. If both are not
1191 * done, return here.
1193 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1194 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1201 * When reading, either dispose of the second buffer
1202 * or initiate I/O on the second buffer if an error
1203 * occurred with this one.
1205 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1206 if (cbp->cb_buf.b_flags & B_ERROR) {
1207 cbp->cb_mirror->cb_pflags |=
1210 cbp->cb_mirror->cb_buf.b_vp,
1211 &cbp->cb_mirror->cb_buf
1217 putccdbuf(cbp->cb_mirror);
1225 * use b_bufsize to determine how big the original request was rather
1226 * than b_bcount, because b_bcount may have been truncated for EOF.
1228 * XXX We check for an error, but we do not test the resid for an
1229 * aligned EOF condition. This may result in character & block
1230 * device access not recognizing EOF properly when read or written
1231 * sequentially, but will not affect filesystems.
1233 count = cbp->cb_buf.b_bufsize;
1237 * If all done, "interrupt".
1239 bp->b_resid -= count;
1240 if (bp->b_resid < 0)
1241 panic("ccdiodone: count");
1242 if (bp->b_resid == 0)
1243 ccdintr(&ccd_softc[unit], bp);
1248 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, d_thread_t *td)
1250 int unit = ccdunit(dev);
1251 int i, j, lookedup = 0, error = 0;
1253 struct ccd_softc *cs;
1254 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1255 struct ccddevice ccd;
1260 KKASSERT(td->td_proc != NULL);
1261 cred = td->td_proc->p_ucred;
1265 cs = &ccd_softc[unit];
1267 bzero(&ccd, sizeof(ccd));
1271 if (cs->sc_flags & CCDF_INITED)
1274 if ((flag & FWRITE) == 0)
1277 if ((error = ccdlock(cs)) != 0)
1280 if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1283 /* Fill in some important bits. */
1284 ccd.ccd_unit = unit;
1285 ccd.ccd_interleave = ccio->ccio_ileave;
1286 if (ccd.ccd_interleave == 0 &&
1287 ((ccio->ccio_flags & CCDF_MIRROR) ||
1288 (ccio->ccio_flags & CCDF_PARITY))) {
1289 printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1290 ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1292 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1293 (ccio->ccio_flags & CCDF_PARITY)) {
1294 printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1295 ccio->ccio_flags &= ~CCDF_PARITY;
1297 if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1298 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1299 printf("ccd%d: mirror/parity forces uniform flag\n",
1301 ccio->ccio_flags |= CCDF_UNIFORM;
1303 ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1306 * Allocate space for and copy in the array of
1307 * component pathnames and device numbers.
1309 cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1310 M_DEVBUF, M_WAITOK);
1311 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1312 M_DEVBUF, M_WAITOK);
1314 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1315 ccio->ccio_ndisks * sizeof(char **));
1317 free(vpp, M_DEVBUF);
1318 free(cpp, M_DEVBUF);
1324 if (ccddebug & CCDB_INIT)
1325 for (i = 0; i < ccio->ccio_ndisks; ++i)
1326 printf("ccdioctl: component %d: 0x%x\n",
1330 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1332 if (ccddebug & CCDB_INIT)
1333 printf("ccdioctl: lookedup = %d\n", lookedup);
1335 if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1336 for (j = 0; j < lookedup; ++j)
1337 (void)vn_close(vpp[j], FREAD|FWRITE, td);
1338 free(vpp, M_DEVBUF);
1339 free(cpp, M_DEVBUF);
1347 ccd.ccd_ndev = ccio->ccio_ndisks;
1350 * Initialize the ccd. Fills in the softc for us.
1352 if ((error = ccdinit(&ccd, cpp, td)) != 0) {
1353 for (j = 0; j < lookedup; ++j)
1354 (void)vn_close(vpp[j], FREAD|FWRITE, td);
1355 bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1356 free(vpp, M_DEVBUF);
1357 free(cpp, M_DEVBUF);
1363 * The ccd has been successfully initialized, so
1364 * we can place it into the array and read the disklabel.
1366 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1367 ccio->ccio_unit = unit;
1368 ccio->ccio_size = cs->sc_size;
1369 ccdgetdisklabel(dev);
1376 if ((cs->sc_flags & CCDF_INITED) == 0)
1379 if ((flag & FWRITE) == 0)
1382 if ((error = ccdlock(cs)) != 0)
1385 /* Don't unconfigure if any other partitions are open */
1386 part = ccdpart(dev);
1387 pmask = (1 << part);
1388 if ((cs->sc_openmask & ~pmask)) {
1394 * Free ccd_softc information and clear entry.
1397 /* Close the components and free their pathnames. */
1398 for (i = 0; i < cs->sc_nccdisks; ++i) {
1400 * XXX: this close could potentially fail and
1401 * cause Bad Things. Maybe we need to force
1402 * the close to happen?
1405 if (ccddebug & CCDB_VNODE)
1406 vprint("CCDIOCCLR: vnode info",
1407 cs->sc_cinfo[i].ci_vp);
1409 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE, td);
1410 free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1413 /* Free interleave index. */
1414 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1415 free(cs->sc_itable[i].ii_index, M_DEVBUF);
1417 /* Free component info and interleave table. */
1418 free(cs->sc_cinfo, M_DEVBUF);
1419 free(cs->sc_itable, M_DEVBUF);
1420 cs->sc_flags &= ~CCDF_INITED;
1423 * Free ccddevice information and clear entry.
1425 free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1426 free(ccddevs[unit].ccd_vpp, M_DEVBUF);
1428 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1431 * And remove the devstat entry.
1433 devstat_remove_entry(&cs->device_stats);
1435 /* This must be atomic. */
1438 bzero(cs, sizeof(struct ccd_softc));
1444 if ((cs->sc_flags & CCDF_INITED) == 0)
1447 *(struct disklabel *)data = cs->sc_label;
1451 if ((cs->sc_flags & CCDF_INITED) == 0)
1454 ((struct partinfo *)data)->disklab = &cs->sc_label;
1455 ((struct partinfo *)data)->part =
1456 &cs->sc_label.d_partitions[ccdpart(dev)];
1461 if ((cs->sc_flags & CCDF_INITED) == 0)
1464 if ((flag & FWRITE) == 0)
1467 if ((error = ccdlock(cs)) != 0)
1470 cs->sc_flags |= CCDF_LABELLING;
1472 error = setdisklabel(&cs->sc_label,
1473 (struct disklabel *)data, 0);
1475 if (cmd == DIOCWDINFO)
1476 error = writedisklabel(CCDLABELDEV(dev),
1480 cs->sc_flags &= ~CCDF_LABELLING;
1489 if ((cs->sc_flags & CCDF_INITED) == 0)
1492 if ((flag & FWRITE) == 0)
1494 if (*(int *)data != 0)
1495 cs->sc_flags |= CCDF_WLABEL;
1497 cs->sc_flags &= ~CCDF_WLABEL;
1510 struct ccd_softc *cs;
/*
 * NOTE(review): the header of this function is not part of this excerpt.
 * The visible statements open the device, look up the partition entry in
 * the unit's in-core label, and close the device again.
 */
1513 if (ccdopen(dev, 0, S_IFCHR, curthread))
/* Locate the softc and the partition index encoded in dev. */
1516 cs = &ccd_softc[ccdunit(dev)];
1517 part = ccdpart(dev);
/* A unit without CCDF_INITED set has not been configured. */
1519 if ((cs->sc_flags & CCDF_INITED) == 0)
/*
 * The size is taken from p_size; the partition type is compared against
 * FS_SWAP first.  NOTE(review): the branch taken for non-swap partitions
 * is not visible here.
 */
1522 if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1525 size = cs->sc_label.d_partitions[part].p_size;
/* Pairs with the ccdopen() call above. */
1527 if (ccdclose(dev, 0, S_IFCHR, curthread))
1538 /* Not implemented. */
1543 * Look up the provided name in the filesystem. If the file exists,
1544 * is a valid block device, and isn't being used by anyone else,
1545 * set *vpp to the file's vnode.
1548 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
/*
 * Open the component named by path for reading and writing and vet the
 * resulting vnode.  path is handed to NDINIT as a user-space string
 * (UIO_USERSPACE) and symbolic links are followed (CNP_FOLLOW).  The
 * vnode is checked for other users (v_usecount > 1) and must be accepted
 * by vn_isdisk().  Callers treat a non-zero return value as an error.
 */
1550 struct nameidata nd;
/* td must belong to a process; the credentials are taken from td_proc. */
1555 KKASSERT(td->td_proc);
1556 cred = td->td_proc->p_ucred;
1558 NDINIT(&nd, NAMEI_LOOKUP, CNP_FOLLOW, UIO_USERSPACE, path, td);
1559 if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) {
/*
 * '&' binds tighter than '|', so the debug mask needs its own
 * parentheses.  Without them the test is
 * (ccddebug & CCDB_FOLLOW) | CCDB_INIT, which is non-zero whenever
 * CCDB_INIT is non-zero, whatever ccddebug holds.
 */
1561 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1562 printf("ccdlookup: vn_open error = %d\n", error);
/* More than one reference means someone else is using the vnode. */
1568 if (vp->v_usecount > 1) {
/* vn_isdisk() stores its reason in error when it rejects the vnode. */
1573 if (!vn_isdisk(vp, &error))
1577 if (ccddebug & CCDB_VNODE)
1578 vprint("ccdlookup: vnode info", vp);
/*
 * First exit sequence: unlock the vnode and release only the pathname
 * buffer.  NOTE(review): the statements that store *vpp and return are
 * not visible in this excerpt.
 */
1581 VOP_UNLOCK(vp, NULL, 0, td);
1582 NDFREE(&nd, NDF_ONLY_PNBUF);
/* Second exit sequence: same cleanup, then close the vnode opened above. */
1586 VOP_UNLOCK(vp, NULL, 0, td);
1587 NDFREE(&nd, NDF_ONLY_PNBUF);
1588 /* vn_close does vrele() for vp */
1589 (void)vn_close(vp, FREAD|FWRITE, td);
1594 * Read the disklabel from the ccd. If one is not present, fake one
1598 ccdgetdisklabel(dev)
/*
 * Build a default in-core label for the unit selected by dev from the
 * softc's size and geometry, then let readdisklabel() try to replace it.
 * If readdisklabel() reports an error string, ccdmakedisklabel() adjusts
 * the default label instead.
 */
1601 int unit = ccdunit(dev);
1602 struct ccd_softc *cs = &ccd_softc[unit];
1604 struct disklabel *lp = &cs->sc_label;
1605 struct ccdgeom *ccg = &cs->sc_geom;
1607 bzero(lp, sizeof(*lp));
/* Geometry fields are copied from the softc; d_secpercyl is derived. */
1609 lp->d_secperunit = cs->sc_size;
1610 lp->d_secsize = ccg->ccg_secsize;
1611 lp->d_nsectors = ccg->ccg_nsectors;
1612 lp->d_ntracks = ccg->ccg_ntracks;
1613 lp->d_ncylinders = ccg->ccg_ncylinders;
1614 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1616 strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1617 lp->d_type = DTYPE_CCD;
1618 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1620 lp->d_interleave = 1;
/* The raw partition starts at offset 0 and spans all sc_size sectors. */
1623 lp->d_partitions[RAW_PART].p_offset = 0;
1624 lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1625 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1626 lp->d_npartitions = RAW_PART + 1;
1628 lp->d_bbsize = BBSIZE; /* XXX */
1629 lp->d_sbsize = SBSIZE; /* XXX */
/* Set both magic numbers, then checksum the label with dkcksum(). */
1631 lp->d_magic = DISKMAGIC;
1632 lp->d_magic2 = DISKMAGIC;
1633 lp->d_checksum = dkcksum(&cs->sc_label);
1636 * Call the generic disklabel extraction routine.
1638 errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
/* A non-NULL errstring means readdisklabel() reported a problem. */
1639 if (errstring != NULL)
1640 ccdmakedisklabel(cs);
1643 /* It's actually extremely common to have unlabeled ccds. */
1644 if (ccddebug & CCDB_LABEL)
1645 if (errstring != NULL)
1646 printf("ccd%d: %s\n", unit, errstring);
1651 * Take care of things one might want to take care of in the event
1652 * that a disklabel isn't present.
1655 ccdmakedisklabel(cs)
1656 struct ccd_softc *cs;
/*
 * Adjust the in-core label in cs->sc_label for a unit on which
 * readdisklabel() reported an error: the raw partition is retyped and
 * the pack name replaced.
 */
1658 struct disklabel *lp = &cs->sc_label;
1661 * For historical reasons, if there's no disklabel present
1662 * the raw partition must be marked FS_BSDFFS.
1664 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
/* Overwrites the pack name previously stored in the label. */
1666 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1670 * Wait interruptibly for an exclusive lock.
1673 * Several drivers do this; it should be abstracted and made MP-safe.
1677 struct ccd_softc *cs;
/*
 * NOTE(review): the function name line is not part of this excerpt.
 * While CCDF_LOCKED is set in sc_flags, record interest with
 * CCDF_WANTED and sleep on cs.  PCATCH lets a signal interrupt the
 * sleep, and a non-zero tsleep() result is captured in error.  The
 * timeout argument is 0.
 */
1681 while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1682 cs->sc_flags |= CCDF_WANTED;
1683 if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
/* The flag was clear when the loop ended; take the lock. */
1686 cs->sc_flags |= CCDF_LOCKED;
1691 * Unlock and wake up any waiters.
1695 struct ccd_softc *cs;
/*
 * NOTE(review): the function name line is not part of this excerpt.
 * Drops CCDF_LOCKED and, if CCDF_WANTED was set, clears that flag too.
 * The wakeup call itself is presumably on a line not shown here.
 */
1698 cs->sc_flags &= ~CCDF_LOCKED;
1699 if ((cs->sc_flags & CCDF_WANTED) != 0) {
1700 cs->sc_flags &= ~CCDF_WANTED;
1708 struct ccdiinfo *ii;
/*
 * NOTE(review): the function name line is not part of this excerpt.
 * Walks the interleave table starting at ii until an entry with
 * ii_ndisk == 0.  For each entry it prints the entry number, ii_ndisk,
 * ii_startblk and ii_startoff, followed by every element of ii_index.
 */
1712 for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1713 printf(" itab[%d]: #dk %d sblk %d soff %d",
1714 ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
/* ii_index holds ii_ndisk entries for this table row. */
1715 for (i = 0; i < ii->ii_ndisk; i++)
1716 printf(" %d", ii->ii_index[i]);
1723 /* Local Variables: */
1724 /* c-argdecl-indent: 8 */
1725 /* c-continued-statement-offset: 8 */
1726 /* c-indent-level: 8 */