2 * ----------------------------------------------------------------------------
3 * "THE BEER-WARE LICENSE" (Revision 42):
4 * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
5 * can do whatever you want with this stuff. If we meet some day, and you think
6 * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
7 * ----------------------------------------------------------------------------
9 * Copyright (c) 1982, 1986, 1988, 1993
10 * The Regents of the University of California. All rights reserved.
11 * (c) UNIX System Laboratories, Inc.
12 * All or some portions of this file are derived from material licensed
13 * to the University of California by American Telephone and Telegraph
14 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
15 * the permission of UNIX System Laboratories, Inc.
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
20 * 1. Redistributions of source code must retain the above copyright
21 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in the
24 * documentation and/or other materials provided with the distribution.
25 * 3. All advertising materials mentioning features or use of this software
26 * must display the following acknowledgement:
27 * This product includes software developed by the University of
28 * California, Berkeley and its contributors.
29 * 4. Neither the name of the University nor the names of its contributors
30 * may be used to endorse or promote products derived from this software
31 * without specific prior written permission.
33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
45 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
46 * $FreeBSD: src/sys/kern/subr_disk.c,v 1.20.2.6 2001/10/05 07:14:57 peter Exp $
47 * $FreeBSD: src/sys/ufs/ufs/ufs_disksubr.c,v 1.44.2.3 2001/03/05 05:42:19 obrien Exp $
48 * $DragonFly: src/sys/kern/subr_disk.c,v 1.8 2004/02/18 06:59:15 dillon Exp $
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/kernel.h>
55 #include <sys/sysctl.h>
58 #include <sys/disklabel.h>
59 #include <sys/diskslice.h>
61 #include <sys/malloc.h>
62 #include <sys/sysctl.h>
63 #include <machine/md_var.h>
64 #include <sys/ctype.h>
65 #include <sys/syslog.h>
66 #include <sys/device.h>
67 #include <sys/msgport.h>
68 #include <sys/msgport2.h>
/* Malloc type named "disk" (description "disk data"). */
71 static MALLOC_DEFINE(M_DISK, "disk", "disk data");
/*
 * Forward declarations of the file-local device entry points and of the
 * port intercept routine; all are defined further down in this file.
 */
73 static d_strategy_t diskstrategy;
74 static d_open_t diskopen;
75 static d_close_t diskclose;
76 static d_ioctl_t diskioctl;
77 static d_psize_t diskpsize;
78 static int disk_putport(lwkt_port_t port, lwkt_msg_t msg);
/*
 * List of struct disk, linked through d_list: disk_create() inserts,
 * disk_destroy() removes and disk_enumerate() walks it.
 */
80 static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist);
/*
 * inherit_raw: copy si_disk, the si_drv1/si_drv2 fields and the
 * si_iosize_max, si_bsize_phys and si_bsize_best fields from pdev to dev.
 * The callers visible in this file pass the whole-disk raw device
 * (WHOLE_DISK_SLICE, RAW_PART) as pdev.
 */
83 inherit_raw(dev_t pdev, dev_t dev)
85 dev->si_disk = pdev->si_disk;
86 dev->si_drv1 = pdev->si_drv1;
87 dev->si_drv2 = pdev->si_drv2;
88 dev->si_iosize_max = pdev->si_iosize_max;
89 dev->si_bsize_phys = pdev->si_bsize_phys;
90 dev->si_bsize_best = pdev->si_bsize_best;
94 * Create a slice and unit managed disk. The underlying raw disk device
95 * is specified by cdevsw. We create the device as a managed device by
96 * first creating it normally then overriding the message port with our
97 * own frontend (which will be responsible for assigning pblkno).
/*
 * disk_create: reset *dp, set up its intercept port, register that port
 * as an override on cdevsw, create the whole-disk raw device node and
 * link dp onto disklist.
 * NOTE(review): several lines of this function (local declarations and
 * the return statement among them) are not visible in this listing.
 */
100 disk_create(int unit, struct disk *dp, int flags, struct cdevsw *cdevsw)
/* The caller's structure is zeroed, so any previous contents are lost. */
104 bzero(dp, sizeof(*dp));
105 lwkt_initport(&dp->d_port, NULL); /* intercept port */
/* Messages put on d_port are handled by disk_putport(). */
106 dp->d_port.mp_putport = disk_putport;
108 dev = makedev(cdevsw->d_maj, 0); /* base device */
110 /* forwarding port */
/* The returned port is where the handlers in this file forward requests. */
111 dp->d_fwdport = cdevsw_add_override(cdevsw, &dp->d_port);
114 printf("Creating DISK %s%d\n", cdevsw->d_name, unit);
117 * The whole disk placemarker holds the disk structure.
/* Node is named <d_name><unit>, created with UID_ROOT, GID_OPERATOR, 0640. */
119 dev = make_dev(cdevsw, dkmakeminor(unit, WHOLE_DISK_SLICE, RAW_PART),
120 UID_ROOT, GID_OPERATOR, 0640, "%s%d", cdevsw->d_name, unit);
/* flags are kept in d_dsflags; diskopen() passes them on to dsopen(). */
123 dp->d_dsflags = flags;
124 LIST_INSERT_HEAD(&disklist, dp, d_list);
/*
 * disk_destroy: unlink the disk from disklist and zero the structure.
 * d_dev is copied into a local first because the bzero() below wipes
 * disk->d_dev.
 * NOTE(review): what is done with that local afterwards is not visible
 * in this listing.
 */
129 disk_destroy(struct disk *disk)
131 dev_t dev = disk->d_dev;
133 LIST_REMOVE(disk, d_list);
134 bzero(disk, sizeof(*disk));
137 /* YYY remove cdevsw entries? */
/*
 * disk_dumpcheck: compute crash-dump parameters for dev and return them
 * through the out-pointers:
 *   *count   - Maxmem * (PAGE_SIZE / sector size)
 *   *blkno   - partition offset + slice offset + dumplo
 *   *secsize - sector size taken from the label
 * NOTE(review): the lookup of dp, any NULL check on the label and the
 * return statements are not visible in this listing.
 */
142 disk_dumpcheck(dev_t dev, u_int *count, u_int *blkno, u_int *secsize)
145 struct disklabel *dl;
153 dl = dsgetlabel(dev, dp->d_slice);
156 *count = Maxmem * (PAGE_SIZE / dl->d_secsize);
/*
 * Tests whether dumplo is at or below LABELSECTOR, or whether the dump
 * would run past the end of the partition; the action taken when the
 * test is true is not visible here.
 */
157 if (dumplo <= LABELSECTOR ||
158 (dumplo + *count > dl->d_partitions[dkpart(dev)].p_size))
160 boff = dl->d_partitions[dkpart(dev)].p_offset +
161 dp->d_slice->dss_slices[dkslice(dev)].ds_offset;
162 *blkno = boff + dumplo;
163 *secsize = dl->d_secsize;
/*
 * disk_invalidate: hand the address of the disk's slice pointer to
 * dsgone().
 * NOTE(review): any guard around the call is not visible in this listing.
 */
169 disk_invalidate (struct disk *disk)
172 dsgone(&disk->d_slice);
/*
 * disk_enumerate: iterator over disklist. Returns either the first list
 * entry or the entry following disk.
 * NOTE(review): the condition selecting between the two returns is not
 * visible; the loop in sysctl_disks() suggests a NULL argument starts
 * the walk - confirm.
 */
176 disk_enumerate(struct disk *disk)
179 return (LIST_FIRST(&disklist));
181 return (LIST_NEXT(disk, d_list));
/*
 * sysctl_disks: handler for the "disks" sysctl registered below. Walks
 * every disk via disk_enumerate() and copies out each d_dev->si_name.
 * NOTE(review): the conditions around the separator write, the error
 * checks and the return statement are not visible in this listing.
 */
185 sysctl_disks(SYSCTL_HANDLER_ARGS)
193 while ((disk = disk_enumerate(disk))) {
/* Single-space separator. */
195 error = SYSCTL_OUT(req, " ", 1);
201 error = SYSCTL_OUT(req, disk->d_dev->si_name, strlen(disk->d_dev->si_name));
/* One byte taken from an empty string literal, i.e. the terminating NUL. */
205 error = SYSCTL_OUT(req, "", 1);
/* Registers the handler as a read-only string node under _kern. */
209 SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, 0, NULL,
210 sysctl_disks, "A", "names of available disks");
213 * The port intercept functions
/*
 * disk_putport: mp_putport handler installed on d_port by disk_create().
 * Switches on the message command and hands strategy and psize requests
 * to the local handlers; a forward to d_fwdport is also present.
 * NOTE(review): the case labels and callee names for the open, close and
 * ioctl messages are not visible in this listing, nor is the label that
 * guards the forward (presumably the default case).
 */
217 disk_putport(lwkt_port_t port, lwkt_msg_t lmsg)
/*
 * NOTE(review): this cast assumes d_port is the first member of
 * struct disk - confirm against the structure definition.
 */
219 struct disk *disk = (struct disk *)port;
220 cdevallmsg_t msg = (cdevallmsg_t)lmsg;
223 switch(msg->am_lmsg.ms_cmd) {
226 msg->am_open.msg.dev,
228 msg->am_open.devtype,
233 msg->am_close.msg.dev,
235 msg->am_close.devtype,
240 msg->am_ioctl.msg.dev,
246 case CDEV_CMD_STRATEGY:
247 diskstrategy(msg->am_strategy.bp);
/* The size is returned through the message's result field. */
251 msg->am_psize.result = diskpsize(msg->am_psize.msg.dev);
255 error = lwkt_forwardmsg(disk->d_fwdport, &msg->am_lmsg);
/*
 * diskopen: open handler for a slice/partition device. Takes the
 * per-disk DISKFLAG_LOCK, opens the underlying device through the
 * forwarding port when no slice is open yet, then opens the slice via
 * dsopen().
 * NOTE(review): the lookup of dp, the handling of error values, the
 * wakeup of waiters and the return statement are not visible in this
 * listing.
 */
262 diskopen(dev_t dev, int oflags, int devtype, struct thread *td)
/* pdev is the whole-disk raw device that corresponds to dev. */
269 pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
/*
 * Wait for the lock flag to clear: mark the disk as wanted and sleep on
 * dp with PCATCH and a timeout of hz.
 */
275 while (dp->d_flags & DISKFLAG_LOCK) {
276 dp->d_flags |= DISKFLAG_WANTED;
277 error = tsleep(dp, PCATCH, "diskopen", hz);
281 dp->d_flags |= DISKFLAG_LOCK;
/* Nothing on this disk is open yet: open the underlying device first. */
283 if (!dsisopen(dp->d_slice)) {
/* Seed the raw device's si_iosize_max from dev when it is still zero. */
284 if (!pdev->si_iosize_max)
285 pdev->si_iosize_max = dev->si_iosize_max;
286 error = dev_port_dopen(dp->d_fwdport, pdev, oflags, devtype, td);
289 /* Inherit properties from the whole/raw dev_t */
290 inherit_raw(pdev, dev);
295 error = dsopen(dev, devtype, dp->d_dsflags, &dp->d_slice, &dp->d_label);
/* If no slice ended up open, close the underlying device again. */
297 if (!dsisopen(dp->d_slice))
298 dev_port_dclose(dp->d_fwdport, pdev, oflags, devtype, td);
/* Drop the lock flag and clear the wanted flag if it was set. */
300 dp->d_flags &= ~DISKFLAG_LOCK;
301 if (dp->d_flags & DISKFLAG_WANTED) {
302 dp->d_flags &= ~DISKFLAG_WANTED;
/*
 * diskclose: close handler. Closes the slice via dsclose() and, once no
 * slice of the disk remains open, closes the underlying whole-disk raw
 * device through the forwarding port.
 * NOTE(review): the lookup of dp and the return statement are not
 * visible in this listing.
 */
310 diskclose(dev_t dev, int fflag, int devtype, struct thread *td)
317 pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
321 dsclose(dev, devtype, dp->d_slice);
322 if (!dsisopen(dp->d_slice))
323 error = dev_port_dclose(dp->d_fwdport, pdev, fflag, devtype, td);
/*
 * diskstrategy: strategy handler. Runs the buffer through dscheck()
 * against the disk's slice data and forwards it to the underlying
 * driver using the disk's own device (dp->d_dev).
 * NOTE(review): the lookup of dp, the condition under which B_ERROR is
 * set and the body of the dscheck() failure branch are not visible in
 * this listing.
 */
328 diskstrategy(struct buf *bp)
333 pdev = dkmodpart(dkmodslice(bp->b_dev, WHOLE_DISK_SLICE), RAW_PART);
/* b_dev does not carry this disk yet: copy the raw device's properties. */
335 if (dp != bp->b_dev->si_disk)
336 inherit_raw(pdev, bp->b_dev);
340 bp->b_flags |= B_ERROR;
/* A result <= 0 takes the branch below, whose body is not visible. */
345 if (dscheck(bp, dp->d_slice) <= 0) {
349 dev_port_dstrategy(dp->d_fwdport, dp->d_dev, bp);
353 * note: when forwarding the ioctl we use the original device rather than
354 * the whole disk slice.
/*
 * diskioctl: ioctl handler. The request is offered to dsioctl() first;
 * only when that returns ENOIOCTL is it forwarded to the underlying
 * driver, using the original dev rather than pdev.
 * NOTE(review): the lookup of dp and the return statement are not
 * visible in this listing.
 */
357 diskioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
363 pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
367 error = dsioctl(dev, cmd, data, fflag, &dp->d_slice);
368 if (error == ENOIOCTL)
369 error = dev_port_dioctl(dp->d_fwdport, dev, cmd, data, fflag, td);
/*
 * NOTE(review): the function header for this body is not visible in this
 * listing; given the forward declaration and the call in disk_putport()
 * it is presumably diskpsize(dev_t dev) - confirm.
 * Returns dssize() for dev after copying si_drv1/si_drv2 from the
 * whole-disk raw device when dev is not yet associated with this disk.
 */
379 pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
383 if (dp != dev->si_disk) {
384 dev->si_drv1 = pdev->si_drv1;
385 dev->si_drv2 = pdev->si_drv2;
386 /* XXX: don't set bp->b_dev->si_disk (?) */
388 return (dssize(dev, &dp->d_slice));
/*
 * Read-only integer sysctls under _debug_sizeof exposing the compiled
 * sizes of struct disklabel, struct diskslices and struct disk.
 */
391 SYSCTL_DECL(_debug_sizeof);
393 SYSCTL_INT(_debug_sizeof, OID_AUTO, disklabel, CTLFLAG_RD,
394 0, sizeof(struct disklabel), "sizeof(struct disklabel)");
396 SYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD,
397 0, sizeof(struct diskslices), "sizeof(struct diskslices)");
399 SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD,
400 0, sizeof(struct disk), "sizeof(struct disk)");
404 * Seek sort for disks.
406 * The buf_queue keeps two queues, sorted in ascending block order. The first
407 * queue holds those requests which are positioned after the current block
408 * (in the first request); the second, which starts at queue->switch_point,
409 * holds requests which came in after their block number was passed. Thus
410 * we implement a one way scan, retracting after reaching the end of the drive
411 * to the first request on the second queue, at which time it becomes the
414 * A one-way scan is natural because of the way UNIX read-ahead blocks are
/*
 * bufqdisksort: insert bp into bufq, ordering by b_pblkno (old-style
 * K&R parameter declarations).
 * Local roles as used below: bq is the element the scan starts from, be
 * is the last element of the list being inserted into, and bn is the
 * successor of bq during the final insertion-sort walk.
 * NOTE(review): several statements, closing braces and returns are not
 * visible in this listing; do not infer the full control flow from the
 * visible lines alone.
 */
418 bufqdisksort(bufq, bp)
419 struct buf_queue_head *bufq;
/* be starts out as the last element of the entire queue. */
426 be = TAILQ_LAST(&bufq->queue, buf_queue);
428 * If the queue is empty or we are an
429 * ordered transaction, then it's easy.
431 if ((bq = bufq_first(bufq)) == NULL
432 || (bp->b_flags & B_ORDERED) != 0) {
433 bufq_insert_tail(bufq, bp);
435 } else if (bufq->insert_point != NULL) {
438 * A certain portion of the list is
439 * "locked" to preserve ordering, so
440 * we can only insert after the insert
443 bq = bufq->insert_point;
447 * If we lie before the last removed (currently active)
448 * request, and are not inserting ourselves into the
449 * "locked" portion of the list, then we must add ourselves
450 * to the second request list.
452 if (bp->b_pblkno < bufq->last_pblkno) {
454 bq = bufq->switch_point;
456 * If we are starting a new secondary list,
460 bufq->switch_point = bp;
461 bufq_insert_tail(bufq, bp);
465 * If we lie ahead of the current switch point,
466 * insert us before the switch point and move
469 if (bp->b_pblkno < bq->b_pblkno) {
470 bufq->switch_point = bp;
471 TAILQ_INSERT_BEFORE(bq, bp, b_act);
/* With a second list present, be becomes the element before switch_point. */
475 if (bufq->switch_point != NULL)
476 be = TAILQ_PREV(bufq->switch_point,
479 * If we lie between last_pblkno and bq,
482 if (bp->b_pblkno < bq->b_pblkno) {
483 TAILQ_INSERT_BEFORE(bq, bp, b_act);
490 * Request is at/after our current position in the list.
491 * Optimize for sequential I/O by seeing if we go at the tail.
493 if (bp->b_pblkno > be->b_pblkno) {
494 TAILQ_INSERT_AFTER(&bufq->queue, be, bp, b_act);
498 /* Otherwise, insertion sort */
499 while ((bn = TAILQ_NEXT(bq, b_act)) != NULL) {
502 * We want to go after the current request if it is the end
503 * of the first request list, or if the next request is a
504 * larger cylinder than our request.
506 if (bn == bufq->switch_point
507 || bp->b_pblkno < bn->b_pblkno)
511 TAILQ_INSERT_AFTER(&bufq->queue, bq, bp, b_act);
516 * Attempt to read a disk label from a device using the indicated strategy
517 * routine. The label must be partly set up before this: secpercyl, secsize
518 * and anything required in the strategy routine (e.g., dummy bounds for the
519 * partition containing the label) must be filled in before calling us.
520 * Returns NULL on success and an error string on failure.
/*
 * readdisklabel: read the label sector of dev into a temporary buffer
 * and scan it for a valid disklabel (old-style K&R parameter
 * declarations). msg carries the error string, if any.
 * NOTE(review): the issuing of the read, the checksum test, the copy
 * into *lp and the return statement are not visible in this listing.
 */
523 readdisklabel(dev, lp)
525 struct disklabel *lp;
528 struct disklabel *dlp;
/* Temporary buffer of one sector (lp->d_secsize bytes). */
531 bp = geteblk((int)lp->d_secsize);
/* LABELSECTOR is scaled from d_secsize units to DEV_BSIZE units. */
533 bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE);
534 bp->b_bcount = lp->d_secsize;
535 bp->b_flags &= ~B_INVAL;
536 bp->b_flags |= B_READ;
/*
 * Scan the buffer in sizeof(long) steps; the upper bound keeps a whole
 * struct disklabel inside the d_secsize bytes that were read.
 */
540 else for (dlp = (struct disklabel *)bp->b_data;
541 dlp <= (struct disklabel *)((char *)bp->b_data +
542 lp->d_secsize - sizeof(*dlp));
543 dlp = (struct disklabel *)((char *)dlp + sizeof(long))) {
/* Both magic numbers must match for a candidate to be considered. */
544 if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC) {
546 msg = "no disk label";
547 } else if (dlp->d_npartitions > MAXPARTITIONS ||
549 msg = "disk label corrupted";
/* Flag the temporary buffer invalid and aged. */
556 bp->b_flags |= B_INVAL | B_AGE;
562 * Check new disk label for sensibility before setting it.
/*
 * setdisklabel: validate the new label nlp against the old label olp and
 * the bitmask of open partitions (old-style K&R parameter declarations),
 * then recompute the new label's checksum.
 * NOTE(review): the error returns, the copy of *nlp over *olp and the
 * final return are not visible in this listing.
 */
565 setdisklabel(olp, nlp, openmask)
566 struct disklabel *olp, *nlp;
570 struct partition *opp, *npp;
573 * Check it is actually a disklabel we are looking at.
575 if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC ||
579 * For each partition that we think is open,
/*
 * ffs() returns a 1-based bit position, or 0 when no bit is set.
 * NOTE(review): the conversion of i to a 0-based index is not visible in
 * this listing - confirm it happens before the mask update and the
 * d_partitions[] accesses below.
 */
581 while ((i = ffs((long)openmask)) != 0) {
584 * Check it is not changing....
586 openmask &= ~(1 << i);
587 if (nlp->d_npartitions <= i)
589 opp = &olp->d_partitions[i];
590 npp = &nlp->d_partitions[i];
/*
 * Tests whether an open partition's offset changed or its size shrank;
 * the action taken when the test is true is not visible here.
 */
591 if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size)
594 * Copy internally-set partition information
595 * if new label doesn't include it. XXX
596 * (If we are using it then we had better stay the same type)
597 * This is possibly dubious, as someone else noted (XXX)
599 if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) {
600 npp->p_fstype = opp->p_fstype;
601 npp->p_fsize = opp->p_fsize;
602 npp->p_frag = opp->p_frag;
603 npp->p_cpg = opp->p_cpg;
/* Recompute the checksum after the possible field copies above. */
607 nlp->d_checksum = dkcksum(nlp);
613 * Write disk label back to device after modification.
/*
 * writedisklabel: write the label lp to the label sector of dev
 * (old-style K&R parameter declarations). The sector is read first so
 * an existing label can be located inside it.
 * NOTE(review): the actual I/O calls, the copy of *lp into the buffer,
 * the preprocessor conditionals that select between the two write paths
 * below and the return of the final error are not visible in this
 * listing.
 */
616 writedisklabel(dev, lp)
618 struct disklabel *lp;
621 struct disklabel *dlp;
/* Refuse when the RAW_PART partition does not start at offset 0. */
624 if (lp->d_partitions[RAW_PART].p_offset != 0)
625 return (EXDEV); /* not quite right */
626 bp = geteblk((int)lp->d_secsize);
/* I/O is addressed to the RAW_PART partition of dev. */
627 bp->b_dev = dkmodpart(dev, RAW_PART);
628 bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE);
629 bp->b_bcount = lp->d_secsize;
632 * We read the label first to see if it's there,
633 * in which case we will put ours at the same offset into the block..
634 * (I think this is stupid [Julian])
635 * Note that you can't write a label out over a corrupted label!
636 * (also stupid.. how do you write the first one? by raw writes?)
638 bp->b_flags &= ~B_INVAL;
639 bp->b_flags |= B_READ;
/* Scan the sector in sizeof(long) steps for a label with both magics. */
644 for (dlp = (struct disklabel *)bp->b_data;
645 dlp <= (struct disklabel *)
646 ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp));
647 dlp = (struct disklabel *)((char *)dlp + sizeof(long))) {
648 if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC &&
/* Turn the same buffer around from a completed read into a write. */
651 bp->b_flags &= ~(B_DONE | B_READ);
652 bp->b_flags |= B_WRITE;
654 alpha_fix_srm_checksum(bp);
/*
 * NOTE(review): the lines below zero the buffer and place the label at
 * its start; presumably an alternative path that skips the read - the
 * selecting condition is not visible here.
 */
664 bzero(bp->b_data, lp->d_secsize);
665 dlp = (struct disklabel *)bp->b_data;
667 bp->b_flags &= ~B_INVAL;
668 bp->b_flags |= B_WRITE;
/* Flag the temporary buffer invalid and aged. */
672 bp->b_flags |= B_INVAL | B_AGE;
678 * Disk error is the preface to plaintive error messages
679 * about failing disk transfers. It prints messages of the form
681 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
683 * if the offset of the error in the transfer and a disk label
684 * are both available. blkdone should be -1 if the position of the error
685 * is unknown; the disklabel pointer may be null from drivers that have not
686 * been converted to use them. The message is printed with printf
687 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
688 * The message should be completed (with at least a newline) with printf
689 * or addlog, respectively. There is no trailing space.
/*
 * diskerr: print the leading part of a disk error message for bp
 * (old-style K&R parameter declarations). When a label is supplied and
 * the error position is known or the transfer is no larger than one
 * sector, an absolute block/cylinder/track/sector breakdown is appended.
 * NOTE(review): the declarations of what, pri, blkdone, sname, partname
 * and sn, the pri-dependent choice of output routine and the computation
 * of sn from blkdone are not visible in this listing.
 */
692 diskerr(bp, what, pri, blkdone, lp)
696 struct disklabel *lp;
698 int unit = dkunit(bp->b_dev);
699 int slice = dkslice(bp->b_dev);
700 int part = dkpart(bp->b_dev);
705 sname = dsname(bp->b_dev, unit, slice, part, partname);
/* "read" or "writ" is completed to "reading"/"writing" by the format. */
706 printf("%s%s: %s %sing fsbn ", sname, partname, what,
707 bp->b_flags & B_READ ? "read" : "writ");
/* A transfer of at most DEV_BSIZE bytes prints a single block number. */
709 if (bp->b_bcount <= DEV_BSIZE)
710 printf("%ld", (long)sn);
714 printf("%ld of ", (long)sn);
/* Range of DEV_BSIZE blocks covered by the transfer. */
716 printf("%ld-%ld", (long)bp->b_blkno,
717 (long)(bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE));
719 if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
/* Rescale sn by DEV_BSIZE / d_secsize, then add the partition offset. */
721 sn *= DEV_BSIZE / lp->d_secsize; /* XXX */
723 sn += lp->d_partitions[part].p_offset;
725 * XXX should add slice offset and not print the slice,
726 * but we don't know the slice pointer.
727 * XXX should print bp->b_pblkno so that this will work
728 * independent of slices, labels and bad sector remapping,
729 * but some drivers don't set bp->b_pblkno.
731 printf(" (%s bn %ld; cn %ld", sname, (long)sn,
732 (long)(sn / lp->d_secpercyl));
/* The remainder within the cylinder is split by d_nsectors below. */
733 sn %= (long)lp->d_secpercyl;
734 printf(" tn %ld sn %ld)", (long)(sn / lp->d_nsectors),
735 (long)(sn % lp->d_nsectors));