kernel - All lwkt thread now start out mpsafe part 1/2
[dragonfly.git] / sys / kern / subr_disk.c
CommitLineData
984263bc 1/*
149e86b9 2 * Copyright (c) 2003,2004,2009 The DragonFly Project. All rights reserved.
d7d5e114 3 *
8c10bfcf
MD
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
d7d5e114 6 *
8c10bfcf
MD
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
d7d5e114 10 *
8c10bfcf
MD
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
d7d5e114 20 *
8c10bfcf
MD
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
d7d5e114 33 *
984263bc
MD
34 * ----------------------------------------------------------------------------
35 * "THE BEER-WARE LICENSE" (Revision 42):
36 * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
37 * can do whatever you want with this stuff. If we meet some day, and you think
38 * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
39 * ----------------------------------------------------------------------------
40 *
7a9e53ad
MD
41 * Copyright (c) 1982, 1986, 1988, 1993
42 * The Regents of the University of California. All rights reserved.
43 * (c) UNIX System Laboratories, Inc.
44 * All or some portions of this file are derived from material licensed
45 * to the University of California by American Telephone and Telegraph
46 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
47 * the permission of UNIX System Laboratories, Inc.
48 *
49 * Redistribution and use in source and binary forms, with or without
50 * modification, are permitted provided that the following conditions
51 * are met:
52 * 1. Redistributions of source code must retain the above copyright
53 * notice, this list of conditions and the following disclaimer.
54 * 2. Redistributions in binary form must reproduce the above copyright
55 * notice, this list of conditions and the following disclaimer in the
56 * documentation and/or other materials provided with the distribution.
57 * 3. All advertising materials mentioning features or use of this software
58 * must display the following acknowledgement:
59 * This product includes software developed by the University of
60 * California, Berkeley and its contributors.
61 * 4. Neither the name of the University nor the names of its contributors
62 * may be used to endorse or promote products derived from this software
63 * without specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
66 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
69 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
70 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
71 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
72 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
73 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
74 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
75 * SUCH DAMAGE.
984263bc 76 *
7a9e53ad
MD
77 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
78 * $FreeBSD: src/sys/kern/subr_disk.c,v 1.20.2.6 2001/10/05 07:14:57 peter Exp $
79 * $FreeBSD: src/sys/ufs/ufs/ufs_disksubr.c,v 1.44.2.3 2001/03/05 05:42:19 obrien Exp $
3641b7ca 80 * $DragonFly: src/sys/kern/subr_disk.c,v 1.40 2008/06/05 18:06:32 swildner Exp $
984263bc
MD
81 */
82
83#include <sys/param.h>
84#include <sys/systm.h>
85#include <sys/kernel.h>
7a9e53ad 86#include <sys/proc.h>
984263bc
MD
87#include <sys/sysctl.h>
88#include <sys/buf.h>
89#include <sys/conf.h>
cd29885a
MD
90#include <sys/disklabel.h>
91#include <sys/disklabel32.h>
92#include <sys/disklabel64.h>
7a9e53ad 93#include <sys/diskslice.h>
cd29885a 94#include <sys/diskmbr.h>
984263bc 95#include <sys/disk.h>
b24cd69c 96#include <sys/kerneldump.h>
984263bc
MD
97#include <sys/malloc.h>
98#include <sys/sysctl.h>
99#include <machine/md_var.h>
100#include <sys/ctype.h>
7a9e53ad
MD
101#include <sys/syslog.h>
102#include <sys/device.h>
335dda38 103#include <sys/msgport.h>
2c1e28dd 104#include <sys/devfs.h>
be755ff9 105#include <sys/thread.h>
8c72e3d5 106#include <sys/dsched.h>
cd29885a
MD
107#include <sys/queue.h>
108#include <sys/lock.h>
f5d8307c 109#include <sys/udev.h>
984263bc 110
9f889dc4
MD
111#include <sys/buf2.h>
112#include <sys/mplock2.h>
113#include <sys/msgport2.h>
114#include <sys/thread2.h>
115
984263bc 116static MALLOC_DEFINE(M_DISK, "disk", "disk data");
8c05caab 117static int disk_debug_enable = 0;
984263bc 118
cd29885a
MD
119static void disk_msg_autofree_reply(lwkt_port_t, lwkt_msg_t);
120static void disk_msg_core(void *);
aec8eea4
MD
121static int disk_probe_slice(struct disk *dp, cdev_t dev, int slice, int reprobe);
122static void disk_probe(struct disk *dp, int reprobe);
123static void _setdiskinfo(struct disk *disk, struct disk_info *info);
30e5862e 124static void bioqwritereorder(struct bio_queue_head *bioq);
fbbbca99 125static void disk_cleanserial(char *serno);
cd29885a 126
984263bc 127static d_open_t diskopen;
d7d5e114 128static d_close_t diskclose;
984263bc 129static d_ioctl_t diskioctl;
fef8985e 130static d_strategy_t diskstrategy;
984263bc 131static d_psize_t diskpsize;
e4c9c0c8 132static d_clone_t diskclone;
fef8985e 133static d_dump_t diskdump;
984263bc
MD
134
135static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist);
be755ff9 136static struct lwkt_token disklist_token;
984263bc 137
fef8985e 138static struct dev_ops disk_ops = {
9f889dc4 139 { "disk", 0, D_DISK | D_MPSAFE },
fef8985e
MD
140 .d_open = diskopen,
141 .d_close = diskclose,
142 .d_read = physread,
143 .d_write = physwrite,
144 .d_ioctl = diskioctl,
145 .d_strategy = diskstrategy,
146 .d_dump = diskdump,
147 .d_psize = diskpsize,
148 .d_clone = diskclone
149};
150
cd29885a
MD
151static struct objcache *disk_msg_cache;
152
153struct objcache_malloc_args disk_msg_malloc_args = {
154 sizeof(struct disk_msg), M_DISK };
155
156static struct lwkt_port disk_dispose_port;
157static struct lwkt_port disk_msg_port;
158
8c05caab
AH
159static int
160disk_debug(int level, char *fmt, ...)
161{
162 __va_list ap;
163
164 __va_start(ap, fmt);
165 if (level <= disk_debug_enable)
166 kvprintf(fmt, ap);
167 __va_end(ap);
168
169 return 0;
170}
cd29885a
MD
171
172static int
aec8eea4 173disk_probe_slice(struct disk *dp, cdev_t dev, int slice, int reprobe)
cd29885a
MD
174{
175 struct disk_info *info = &dp->d_info;
176 struct diskslice *sp = &dp->d_slice->dss_slices[slice];
177 disklabel_ops_t ops;
178 struct partinfo part;
179 const char *msg;
180 cdev_t ndev;
440a040b 181 int sno;
539f339e 182 u_int i;
cd29885a 183
8c05caab
AH
184 disk_debug(2,
185 "disk_probe_slice (begin): %s (%s)\n",
186 dev->si_name, dp->d_cdev->si_name);
187
440a040b
MD
188 sno = slice ? slice - 1 : 0;
189
cd29885a
MD
190 ops = &disklabel32_ops;
191 msg = ops->op_readdisklabel(dev, sp, &sp->ds_label, info);
192 if (msg && !strcmp(msg, "no disk label")) {
cd29885a
MD
193 ops = &disklabel64_ops;
194 msg = ops->op_readdisklabel(dev, sp, &sp->ds_label, info);
195 }
cd29885a
MD
196 if (msg == NULL) {
197 if (slice != WHOLE_DISK_SLICE)
198 ops->op_adjust_label_reserved(dp->d_slice, slice, sp);
199 else
200 sp->ds_reserved = 0;
201
202 sp->ds_ops = ops;
cd29885a
MD
203 for (i = 0; i < ops->op_getnumparts(sp->ds_label); i++) {
204 ops->op_loadpartinfo(sp->ds_label, i, &part);
cd29885a 205 if (part.fstype) {
aec8eea4 206 if (reprobe &&
149e86b9 207 (ndev = devfs_find_device_by_name("%s%c",
440a040b 208 dev->si_name, 'a' + i))
149e86b9
MD
209 ) {
210 /*
211 * Device already exists and
212 * is still valid.
213 */
aec8eea4
MD
214 ndev->si_flags |= SI_REPROBE_TEST;
215 } else {
8f960aa9 216 ndev = make_dev_covering(&disk_ops, dp->d_rawdev->si_ops,
149e86b9
MD
217 dkmakeminor(dkunit(dp->d_cdev),
218 slice, i),
aec8eea4 219 UID_ROOT, GID_OPERATOR, 0640,
440a040b 220 "%s%c", dev->si_name, 'a'+ i);
aec8eea4 221 ndev->si_disk = dp;
f5d8307c
AH
222 udev_dict_set_cstr(ndev, "subsystem", "disk");
223 /* Inherit parent's disk type */
224 if (dp->d_disktype) {
225 udev_dict_set_cstr(ndev, "disk-type",
226 __DECONST(char *, dp->d_disktype));
227 }
55230951 228 if (dp->d_info.d_serialno) {
149e86b9
MD
229 make_dev_alias(ndev,
230 "serno/%s.s%d%c",
231 dp->d_info.d_serialno,
440a040b 232 sno, 'a' + i);
55230951 233 }
aec8eea4
MD
234 ndev->si_flags |= SI_REPROBE_TEST;
235 }
cd29885a
MD
236 }
237 }
238 } else if (info->d_dsflags & DSO_COMPATLABEL) {
239 msg = NULL;
240 if (sp->ds_size >= 0x100000000ULL)
241 ops = &disklabel64_ops;
242 else
243 ops = &disklabel32_ops;
244 sp->ds_label = ops->op_clone_label(info, sp);
245 } else {
dc6af901
TN
246 if (sp->ds_type == DOSPTYP_386BSD || /* XXX */
247 sp->ds_type == DOSPTYP_NETBSD ||
248 sp->ds_type == DOSPTYP_OPENBSD) {
cd29885a
MD
249 log(LOG_WARNING, "%s: cannot find label (%s)\n",
250 dev->si_name, msg);
149e86b9 251 }
cd29885a
MD
252 }
253
254 if (msg == NULL) {
255 sp->ds_wlabel = FALSE;
256 }
257
258 return (msg ? EINVAL : 0);
259}
260
666ec833
MD
261/*
262 * This routine is only called for newly minted drives or to reprobe
263 * a drive with no open slices. disk_probe_slice() is called directly
264 * when reprobing partition changes within slices.
265 */
cd29885a 266static void
aec8eea4 267disk_probe(struct disk *dp, int reprobe)
cd29885a
MD
268{
269 struct disk_info *info = &dp->d_info;
270 cdev_t dev = dp->d_cdev;
271 cdev_t ndev;
0831f2ab 272 int error, i, sno;
666ec833 273 struct diskslices *osp;
0831f2ab 274 struct diskslice *sp;
cd29885a 275
cd29885a 276 KKASSERT (info->d_media_blksize != 0);
cd29885a 277
666ec833 278 osp = dp->d_slice;
cd29885a 279 dp->d_slice = dsmakeslicestruct(BASE_SLICE, info);
666ec833 280 disk_debug(1, "disk_probe (begin): %s\n", dp->d_cdev->si_name);
cd29885a
MD
281
282 error = mbrinit(dev, info, &(dp->d_slice));
666ec833
MD
283 if (error) {
284 dsgone(&osp);
cd29885a 285 return;
666ec833 286 }
aec8eea4 287
0831f2ab
MD
288 for (i = 0; i < dp->d_slice->dss_nslices; i++) {
289 /*
290 * Ignore the whole-disk slice, it has already been created.
291 */
292 if (i == WHOLE_DISK_SLICE)
293 continue;
294 sp = &dp->d_slice->dss_slices[i];
295
296 /*
297 * Handle s0. s0 is a compatibility slice if there are no
298 * other slices and it has not otherwise been set up, else
299 * we ignore it.
300 */
301 if (i == COMPATIBILITY_SLICE) {
302 sno = 0;
303 if (sp->ds_type == 0 &&
304 dp->d_slice->dss_nslices == BASE_SLICE) {
305 sp->ds_size = info->d_media_blocks;
306 sp->ds_reserved = 0;
aec8eea4 307 }
0831f2ab
MD
308 } else {
309 sno = i - 1;
310 sp->ds_reserved = 0;
cd29885a 311 }
0831f2ab
MD
312
313 /*
314 * Ignore 0-length slices
315 */
316 if (sp->ds_size == 0)
317 continue;
318
319 if (reprobe &&
320 (ndev = devfs_find_device_by_name("%ss%d",
321 dev->si_name, sno))) {
322 /*
323 * Device already exists and is still valid
324 */
325 ndev->si_flags |= SI_REPROBE_TEST;
326 } else {
327 /*
328 * Else create new device
329 */
8f960aa9 330 ndev = make_dev_covering(&disk_ops, dp->d_rawdev->si_ops,
aec8eea4
MD
331 dkmakewholeslice(dkunit(dev), i),
332 UID_ROOT, GID_OPERATOR, 0640,
0831f2ab 333 "%ss%d", dev->si_name, sno);
f5d8307c
AH
334 udev_dict_set_cstr(ndev, "subsystem", "disk");
335 /* Inherit parent's disk type */
336 if (dp->d_disktype) {
337 udev_dict_set_cstr(ndev, "disk-type",
338 __DECONST(char *, dp->d_disktype));
339 }
0831f2ab
MD
340 if (dp->d_info.d_serialno) {
341 make_dev_alias(ndev, "serno/%s.s%d",
342 dp->d_info.d_serialno, sno);
cd29885a 343 }
0831f2ab
MD
344 ndev->si_disk = dp;
345 ndev->si_flags |= SI_REPROBE_TEST;
346 }
347 sp->ds_dev = ndev;
149e86b9 348
1cb0bdb6
MD
349 /*
350 * Probe appropriate slices for a disklabel
351 *
352 * XXX slice type 1 used by our gpt probe code.
d7d5e114 353 * XXX slice type 0 used by mbr compat slice.
1cb0bdb6 354 */
dc6af901
TN
355 if (sp->ds_type == DOSPTYP_386BSD ||
356 sp->ds_type == DOSPTYP_NETBSD ||
357 sp->ds_type == DOSPTYP_OPENBSD ||
358 sp->ds_type == 0 ||
359 sp->ds_type == 1) {
0831f2ab
MD
360 if (dp->d_slice->dss_first_bsd_slice == 0)
361 dp->d_slice->dss_first_bsd_slice = i;
362 disk_probe_slice(dp, ndev, i, reprobe);
cd29885a
MD
363 }
364 }
666ec833
MD
365 dsgone(&osp);
366 disk_debug(1, "disk_probe (end): %s\n", dp->d_cdev->si_name);
cd29885a
MD
367}
368
369
370static void
371disk_msg_core(void *arg)
372{
cd29885a
MD
373 struct disk *dp;
374 struct diskslice *sp;
149e86b9
MD
375 disk_msg_t msg;
376 int run;
cd29885a 377
c9e9fb21 378 lwkt_gettoken(&disklist_token);
cd29885a 379 lwkt_initport_thread(&disk_msg_port, curthread);
c9e9fb21
MD
380 wakeup(curthread); /* synchronous startup */
381 lwkt_reltoken(&disklist_token);
382
383 get_mplock(); /* not mpsafe yet? */
149e86b9 384 run = 1;
cd29885a 385
149e86b9
MD
386 while (run) {
387 msg = (disk_msg_t)lwkt_waitport(&disk_msg_port, 0);
cd29885a 388
149e86b9
MD
389 switch (msg->hdr.u.ms_result) {
390 case DISK_DISK_PROBE:
cd29885a 391 dp = (struct disk *)msg->load;
8c05caab
AH
392 disk_debug(1,
393 "DISK_DISK_PROBE: %s\n",
394 dp->d_cdev->si_name);
aec8eea4 395 disk_probe(dp, 0);
cd29885a 396 break;
cd29885a
MD
397 case DISK_DISK_DESTROY:
398 dp = (struct disk *)msg->load;
8c05caab
AH
399 disk_debug(1,
400 "DISK_DISK_DESTROY: %s\n",
401 dp->d_cdev->si_name);
cd29885a
MD
402 devfs_destroy_subnames(dp->d_cdev->si_name);
403 devfs_destroy_dev(dp->d_cdev);
3b998fa9 404 lwkt_gettoken(&disklist_token);
aec8eea4 405 LIST_REMOVE(dp, d_list);
3b998fa9 406 lwkt_reltoken(&disklist_token);
55230951
MD
407 if (dp->d_info.d_serialno) {
408 kfree(dp->d_info.d_serialno, M_TEMP);
409 dp->d_info.d_serialno = NULL;
410 }
cd29885a 411 break;
aec8eea4
MD
412 case DISK_UNPROBE:
413 dp = (struct disk *)msg->load;
8c05caab
AH
414 disk_debug(1,
415 "DISK_DISK_UNPROBE: %s\n",
416 dp->d_cdev->si_name);
aec8eea4
MD
417 devfs_destroy_subnames(dp->d_cdev->si_name);
418 break;
cd29885a
MD
419 case DISK_SLICE_REPROBE:
420 dp = (struct disk *)msg->load;
421 sp = (struct diskslice *)msg->load2;
149e86b9
MD
422 devfs_clr_subnames_flag(sp->ds_dev->si_name,
423 SI_REPROBE_TEST);
8c05caab 424 disk_debug(1,
aec8eea4
MD
425 "DISK_SLICE_REPROBE: %s\n",
426 sp->ds_dev->si_name);
149e86b9
MD
427 disk_probe_slice(dp, sp->ds_dev,
428 dkslice(sp->ds_dev), 1);
429 devfs_destroy_subnames_without_flag(
430 sp->ds_dev->si_name, SI_REPROBE_TEST);
cd29885a 431 break;
cd29885a
MD
432 case DISK_DISK_REPROBE:
433 dp = (struct disk *)msg->load;
aec8eea4 434 devfs_clr_subnames_flag(dp->d_cdev->si_name, SI_REPROBE_TEST);
8c05caab 435 disk_debug(1,
aec8eea4
MD
436 "DISK_DISK_REPROBE: %s\n",
437 dp->d_cdev->si_name);
438 disk_probe(dp, 1);
149e86b9
MD
439 devfs_destroy_subnames_without_flag(
440 dp->d_cdev->si_name, SI_REPROBE_TEST);
cd29885a 441 break;
cd29885a 442 case DISK_SYNC:
8c05caab 443 disk_debug(1, "DISK_SYNC\n");
cd29885a 444 break;
149e86b9
MD
445 default:
446 devfs_debug(DEVFS_DEBUG_WARNING,
447 "disk_msg_core: unknown message "
448 "received at core\n");
449 break;
450 }
a9177e09 451 lwkt_replymsg(&msg->hdr, 0);
149e86b9 452 }
cd29885a
MD
453 lwkt_exit();
454}
455
456
149e86b9
MD
457/*
458 * Acts as a message drain. Any message that is replied to here gets
459 * destroyed and the memory freed.
460 */
cd29885a
MD
461static void
462disk_msg_autofree_reply(lwkt_port_t port, lwkt_msg_t msg)
463{
149e86b9 464 objcache_put(disk_msg_cache, msg);
cd29885a
MD
465}
466
467
468void
469disk_msg_send(uint32_t cmd, void *load, void *load2)
470{
149e86b9 471 disk_msg_t disk_msg;
cd29885a
MD
472 lwkt_port_t port = &disk_msg_port;
473
149e86b9 474 disk_msg = objcache_get(disk_msg_cache, M_WAITOK);
cd29885a 475
149e86b9 476 lwkt_initmsg(&disk_msg->hdr, &disk_dispose_port, 0);
cd29885a
MD
477
478 disk_msg->hdr.u.ms_result = cmd;
479 disk_msg->load = load;
480 disk_msg->load2 = load2;
481 KKASSERT(port);
a9177e09 482 lwkt_sendmsg(port, &disk_msg->hdr);
cd29885a
MD
483}
484
aec8eea4
MD
485void
486disk_msg_send_sync(uint32_t cmd, void *load, void *load2)
487{
488 struct lwkt_port rep_port;
a9177e09
MD
489 disk_msg_t disk_msg;
490 lwkt_port_t port;
aec8eea4 491
a9177e09
MD
492 disk_msg = objcache_get(disk_msg_cache, M_WAITOK);
493 port = &disk_msg_port;
494
495 /* XXX could probably use curthread's built-in msgport */
aec8eea4
MD
496 lwkt_initport_thread(&rep_port, curthread);
497 lwkt_initmsg(&disk_msg->hdr, &rep_port, 0);
498
499 disk_msg->hdr.u.ms_result = cmd;
500 disk_msg->load = load;
501 disk_msg->load2 = load2;
502
a9177e09
MD
503 lwkt_sendmsg(port, &disk_msg->hdr);
504 lwkt_waitmsg(&disk_msg->hdr, 0);
505 objcache_put(disk_msg_cache, disk_msg);
aec8eea4
MD
506}
507
335dda38 508/*
fef8985e
MD
509 * Create a raw device for the dev_ops template (which is returned). Also
510 * create a slice and unit managed disk and overload the user visible
511 * device space with it.
e4c9c0c8 512 *
fef8985e
MD
513 * NOTE: The returned raw device is NOT a slice and unit managed device.
514 * It is an actual raw device representing the raw disk as specified by
515 * the passed dev_ops. The disk layer not only returns such a raw device,
516 * it also uses it internally when passing (modified) commands through.
335dda38 517 */
b13267a5 518cdev_t
a688b15c 519disk_create(int unit, struct disk *dp, struct dev_ops *raw_ops)
984263bc 520{
c6ef65ea
AH
521 return disk_create_named(NULL, unit, dp, raw_ops);
522}
523
524cdev_t
525disk_create_named(const char *name, int unit, struct disk *dp, struct dev_ops *raw_ops)
526{
b13267a5 527 cdev_t rawdev;
e4c9c0c8 528
c6ef65ea
AH
529 if (name == NULL)
530 name = raw_ops->head.name;
531
532 disk_debug(1, "disk_create (begin): %s%d\n", name, unit);
8c05caab 533
cd29885a 534 rawdev = make_only_dev(raw_ops, dkmakewholedisk(unit),
e4c9c0c8 535 UID_ROOT, GID_OPERATOR, 0640,
c6ef65ea 536 "%s%d", name, unit);
984263bc 537
cd29885a 538 bzero(dp, sizeof(*dp));
fef8985e 539
e4c9c0c8 540 dp->d_rawdev = rawdev;
fef8985e 541 dp->d_raw_ops = raw_ops;
cd29885a 542 dp->d_dev_ops = &disk_ops;
8f960aa9 543 dp->d_cdev = make_dev_covering(&disk_ops, dp->d_rawdev->si_ops,
5350e1e9 544 dkmakewholedisk(unit),
e4c9c0c8 545 UID_ROOT, GID_OPERATOR, 0640,
c6ef65ea 546 "%s%d", name, unit);
f5d8307c 547 udev_dict_set_cstr(dp->d_cdev, "subsystem", "disk");
cd29885a
MD
548 dp->d_cdev->si_disk = dp;
549
c6ef65ea 550 dsched_disk_create_callback(dp, name, unit);
e02e815e 551
3b998fa9 552 lwkt_gettoken(&disklist_token);
984263bc 553 LIST_INSERT_HEAD(&disklist, dp, d_list);
3b998fa9 554 lwkt_reltoken(&disklist_token);
8c05caab 555
c6ef65ea 556 disk_debug(1, "disk_create (end): %s%d\n", name, unit);
8c05caab 557
e4c9c0c8 558 return (dp->d_rawdev);
984263bc
MD
559}
560
f5d8307c
AH
561int
562disk_setdisktype(struct disk *disk, const char *type)
563{
564 KKASSERT(disk != NULL);
565
566 disk->d_disktype = type;
567 return udev_dict_set_cstr(disk->d_cdev, "disk-type", __DECONST(char *, type));
568}
569
aec8eea4
MD
570static void
571_setdiskinfo(struct disk *disk, struct disk_info *info)
a688b15c 572{
55230951
MD
573 char *oldserialno;
574
55230951 575 oldserialno = disk->d_info.d_serialno;
a688b15c
MD
576 bcopy(info, &disk->d_info, sizeof(disk->d_info));
577 info = &disk->d_info;
578
8c05caab
AH
579 disk_debug(1,
580 "_setdiskinfo: %s\n",
581 disk->d_cdev->si_name);
582
55230951
MD
583 /*
584 * The serial number is duplicated so the caller can throw
585 * their copy away.
586 */
587 if (info->d_serialno && info->d_serialno[0]) {
588 info->d_serialno = kstrdup(info->d_serialno, M_TEMP);
fbbbca99 589 disk_cleanserial(info->d_serialno);
55230951
MD
590 if (disk->d_cdev) {
591 make_dev_alias(disk->d_cdev, "serno/%s",
592 info->d_serialno);
593 }
594 } else {
595 info->d_serialno = NULL;
596 }
597 if (oldserialno)
598 kfree(oldserialno, M_TEMP);
599
279e9fd5
AH
600 dsched_disk_update_callback(disk, info);
601
55230951
MD
602 /*
603 * The caller may set d_media_size or d_media_blocks and we
604 * calculate the other.
605 */
a688b15c
MD
606 KKASSERT(info->d_media_size == 0 || info->d_media_blksize == 0);
607 if (info->d_media_size == 0 && info->d_media_blocks) {
d7d5e114 608 info->d_media_size = (u_int64_t)info->d_media_blocks *
a688b15c 609 info->d_media_blksize;
d7d5e114 610 } else if (info->d_media_size && info->d_media_blocks == 0 &&
a688b15c 611 info->d_media_blksize) {
d7d5e114 612 info->d_media_blocks = info->d_media_size /
a688b15c
MD
613 info->d_media_blksize;
614 }
5d6c6885
MD
615
616 /*
617 * The si_* fields for rawdev are not set until after the
618 * disk_create() call, so someone using the cooked version
619 * of the raw device (i.e. da0s0) will not get the right
620 * si_iosize_max unless we fix it up here.
621 */
622 if (disk->d_cdev && disk->d_rawdev &&
623 disk->d_cdev->si_iosize_max == 0) {
624 disk->d_cdev->si_iosize_max = disk->d_rawdev->si_iosize_max;
625 disk->d_cdev->si_bsize_phys = disk->d_rawdev->si_bsize_phys;
626 disk->d_cdev->si_bsize_best = disk->d_rawdev->si_bsize_best;
627 }
d5cc18b0
AHJ
628
629 /* Add the serial number to the udev_dictionary */
630 if (info->d_serialno)
631 udev_dict_set_cstr(disk->d_cdev, "serno", info->d_serialno);
aec8eea4 632}
cd29885a 633
aec8eea4
MD
634/*
635 * Disk drivers must call this routine when media parameters are available
636 * or have changed.
637 */
638void
639disk_setdiskinfo(struct disk *disk, struct disk_info *info)
640{
641 _setdiskinfo(disk, info);
cd29885a 642 disk_msg_send(DISK_DISK_PROBE, disk, NULL);
8c05caab
AH
643 disk_debug(1,
644 "disk_setdiskinfo: sent probe for %s\n",
645 disk->d_cdev->si_name);
a688b15c
MD
646}
647
aec8eea4
MD
648void
649disk_setdiskinfo_sync(struct disk *disk, struct disk_info *info)
650{
651 _setdiskinfo(disk, info);
aec8eea4 652 disk_msg_send_sync(DISK_DISK_PROBE, disk, NULL);
8c05caab
AH
653 disk_debug(1,
654 "disk_setdiskinfo_sync: sent probe for %s\n",
655 disk->d_cdev->si_name);
aec8eea4
MD
656}
657
a688b15c 658/*
e4c9c0c8
MD
659 * This routine is called when an adapter detaches. The higher level
660 * managed disk device is destroyed while the lower level raw device is
661 * released.
662 */
335dda38
MD
663void
664disk_destroy(struct disk *disk)
665{
0160356d 666 dsched_disk_destroy_callback(disk);
aec8eea4 667 disk_msg_send_sync(DISK_DISK_DESTROY, disk, NULL);
cd29885a 668 return;
335dda38
MD
669}
670
984263bc 671int
b24cd69c 672disk_dumpcheck(cdev_t dev, u_int64_t *size, u_int64_t *blkno, u_int32_t *secsize)
984263bc 673{
a6c0f342
MD
674 struct partinfo pinfo;
675 int error;
984263bc 676
a6c0f342 677 bzero(&pinfo, sizeof(pinfo));
87baaf0c
MD
678 error = dev_dioctl(dev, DIOCGPART, (void *)&pinfo, 0,
679 proc0.p_ucred, NULL);
a6c0f342
MD
680 if (error)
681 return (error);
b24cd69c 682
a6c0f342 683 if (pinfo.media_blksize == 0)
984263bc 684 return (ENXIO);
b24cd69c
AH
685
686 if (blkno) /* XXX: make sure this reserved stuff is right */
687 *blkno = pinfo.reserved_blocks +
688 pinfo.media_offset / pinfo.media_blksize;
689 if (secsize)
690 *secsize = pinfo.media_blksize;
691 if (size)
692 *size = (pinfo.media_blocks - pinfo.reserved_blocks);
693
984263bc 694 return (0);
984263bc
MD
695}
696
b24cd69c
AH
697int
698disk_dumpconf(cdev_t dev, u_int onoff)
699{
700 struct dumperinfo di;
701 u_int64_t size, blkno;
702 u_int32_t secsize;
703 int error;
704
705 if (!onoff)
706 return set_dumper(NULL);
707
708 error = disk_dumpcheck(dev, &size, &blkno, &secsize);
709
710 if (error)
711 return ENXIO;
712
713 bzero(&di, sizeof(struct dumperinfo));
714 di.dumper = diskdump;
715 di.priv = dev;
716 di.blocksize = secsize;
717 di.mediaoffset = blkno * DEV_BSIZE;
718 di.mediasize = size * DEV_BSIZE;
719
720 return set_dumper(&di);
721}
722
aec8eea4
MD
723void
724disk_unprobe(struct disk *disk)
725{
726 if (disk == NULL)
727 return;
728
729 disk_msg_send_sync(DISK_UNPROBE, disk, NULL);
730}
731
d7d5e114 732void
984263bc
MD
733disk_invalidate (struct disk *disk)
734{
666ec833 735 dsgone(&disk->d_slice);
984263bc
MD
736}
737
984263bc
MD
738struct disk *
739disk_enumerate(struct disk *disk)
740{
be755ff9 741 struct disk *dp;
be755ff9 742
3b998fa9 743 lwkt_gettoken(&disklist_token);
984263bc 744 if (!disk)
be755ff9 745 dp = (LIST_FIRST(&disklist));
984263bc 746 else
be755ff9 747 dp = (LIST_NEXT(disk, d_list));
3b998fa9 748 lwkt_reltoken(&disklist_token);
be755ff9
AH
749
750 return dp;
984263bc
MD
751}
752
d7d5e114 753static
fbda7fa6 754int
984263bc
MD
755sysctl_disks(SYSCTL_HANDLER_ARGS)
756{
757 struct disk *disk;
758 int error, first;
759
760 disk = NULL;
761 first = 1;
762
763 while ((disk = disk_enumerate(disk))) {
764 if (!first) {
765 error = SYSCTL_OUT(req, " ", 1);
766 if (error)
767 return error;
768 } else {
769 first = 0;
770 }
95ce4036
HP
771 error = SYSCTL_OUT(req, disk->d_rawdev->si_name,
772 strlen(disk->d_rawdev->si_name));
984263bc
MD
773 if (error)
774 return error;
775 }
776 error = SYSCTL_OUT(req, "", 1);
777 return error;
778}
d7d5e114 779
3641b7ca 780SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
984263bc
MD
781 sysctl_disks, "A", "names of available disks");
782
783/*
e4c9c0c8
MD
784 * Open a disk device or partition.
785 */
fbda7fa6
MD
786static
787int
fef8985e 788diskopen(struct dev_open_args *ap)
984263bc 789{
b13267a5 790 cdev_t dev = ap->a_head.a_dev;
984263bc
MD
791 struct disk *dp;
792 int error;
793
e4c9c0c8
MD
794 /*
795 * dp can't be NULL here XXX.
7ba1363d
MD
796 *
797 * d_slice will be NULL if setdiskinfo() has not been called yet.
798 * setdiskinfo() is typically called whether the disk is present
799 * or not (e.g. CD), but the base disk device is created first
800 * and there may be a race.
e4c9c0c8 801 */
e4c9c0c8 802 dp = dev->si_disk;
7ba1363d 803 if (dp == NULL || dp->d_slice == NULL)
984263bc 804 return (ENXIO);
fef8985e 805 error = 0;
984263bc 806
e4c9c0c8
MD
807 /*
808 * Deal with open races
809 */
9f889dc4 810 get_mplock();
984263bc
MD
811 while (dp->d_flags & DISKFLAG_LOCK) {
812 dp->d_flags |= DISKFLAG_WANTED;
377d4740 813 error = tsleep(dp, PCATCH, "diskopen", hz);
9f889dc4
MD
814 if (error) {
815 rel_mplock();
984263bc 816 return (error);
9f889dc4 817 }
984263bc
MD
818 }
819 dp->d_flags |= DISKFLAG_LOCK;
820
e4c9c0c8
MD
821 /*
822 * Open the underlying raw device.
823 */
984263bc 824 if (!dsisopen(dp->d_slice)) {
e4c9c0c8 825#if 0
984263bc
MD
826 if (!pdev->si_iosize_max)
827 pdev->si_iosize_max = dev->si_iosize_max;
e4c9c0c8 828#endif
fef8985e
MD
829 error = dev_dopen(dp->d_rawdev, ap->a_oflags,
830 ap->a_devtype, ap->a_cred);
984263bc 831 }
cd29885a 832#if 0
e4c9c0c8
MD
833 /*
834 * Inherit properties from the underlying device now that it is
835 * open.
836 */
fef8985e 837 dev_dclone(dev);
cd29885a 838#endif
984263bc
MD
839
840 if (error)
841 goto out;
a688b15c 842 error = dsopen(dev, ap->a_devtype, dp->d_info.d_dsflags,
84f8b009 843 &dp->d_slice, &dp->d_info);
cd29885a 844 if (!dsisopen(dp->d_slice)) {
fef8985e 845 dev_dclose(dp->d_rawdev, ap->a_oflags, ap->a_devtype);
cd29885a 846 }
d7d5e114 847out:
984263bc
MD
848 dp->d_flags &= ~DISKFLAG_LOCK;
849 if (dp->d_flags & DISKFLAG_WANTED) {
850 dp->d_flags &= ~DISKFLAG_WANTED;
851 wakeup(dp);
852 }
9f889dc4 853 rel_mplock();
d7d5e114 854
984263bc
MD
855 return(error);
856}
857
e4c9c0c8
MD
858/*
859 * Close a disk device or partition
860 */
fbda7fa6
MD
861static
862int
fef8985e 863diskclose(struct dev_close_args *ap)
984263bc 864{
b13267a5 865 cdev_t dev = ap->a_head.a_dev;
984263bc
MD
866 struct disk *dp;
867 int error;
984263bc
MD
868
869 error = 0;
e4c9c0c8
MD
870 dp = dev->si_disk;
871
9f889dc4 872 get_mplock();
fef8985e 873 dsclose(dev, ap->a_devtype, dp->d_slice);
cd29885a 874 if (!dsisopen(dp->d_slice)) {
fef8985e 875 error = dev_dclose(dp->d_rawdev, ap->a_fflag, ap->a_devtype);
cd29885a 876 }
9f889dc4 877 rel_mplock();
fef8985e
MD
878 return (error);
879}
880
881/*
d7d5e114 882 * First execute the ioctl on the disk device, and if it isn't supported
fef8985e
MD
883 * try running it on the backing device.
884 */
885static
886int
887diskioctl(struct dev_ioctl_args *ap)
888{
b13267a5 889 cdev_t dev = ap->a_head.a_dev;
fef8985e
MD
890 struct disk *dp;
891 int error;
b24cd69c 892 u_int u;
fef8985e
MD
893
894 dp = dev->si_disk;
895 if (dp == NULL)
896 return (ENXIO);
cd29885a 897
149e86b9 898 devfs_debug(DEVFS_DEBUG_DEBUG,
fcefa6f2 899 "diskioctl: cmd is: %lx (name: %s)\n",
149e86b9
MD
900 ap->a_cmd, dev->si_name);
901 devfs_debug(DEVFS_DEBUG_DEBUG,
fcefa6f2 902 "diskioctl: &dp->d_slice is: %p, %p\n",
149e86b9 903 &dp->d_slice, dp->d_slice);
cd29885a 904
b24cd69c
AH
905 if (ap->a_cmd == DIOCGKERNELDUMP) {
906 u = *(u_int *)ap->a_data;
907 return disk_dumpconf(dev, u);
908 }
909
c6ef65ea
AH
910 if (&dp->d_slice == NULL || dp->d_slice == NULL) {
911 error = ENOIOCTL;
912 } else {
9f889dc4 913 get_mplock();
c6ef65ea
AH
914 error = dsioctl(dev, ap->a_cmd, ap->a_data, ap->a_fflag,
915 &dp->d_slice, &dp->d_info);
9f889dc4 916 rel_mplock();
c6ef65ea 917 }
cd29885a 918
fef8985e
MD
919 if (error == ENOIOCTL) {
920 error = dev_dioctl(dp->d_rawdev, ap->a_cmd, ap->a_data,
87baaf0c 921 ap->a_fflag, ap->a_cred, NULL);
fef8985e 922 }
984263bc
MD
923 return (error);
924}
925
e4c9c0c8
MD
926/*
927 * Execute strategy routine
928 */
fbda7fa6 929static
fef8985e
MD
930int
931diskstrategy(struct dev_strategy_args *ap)
984263bc 932{
b13267a5 933 cdev_t dev = ap->a_head.a_dev;
fef8985e 934 struct bio *bio = ap->a_bio;
81b5c339 935 struct bio *nbio;
984263bc
MD
936 struct disk *dp;
937
81b5c339 938 dp = dev->si_disk;
984263bc 939
e4c9c0c8 940 if (dp == NULL) {
81b5c339
MD
941 bio->bio_buf->b_error = ENXIO;
942 bio->bio_buf->b_flags |= B_ERROR;
943 biodone(bio);
fef8985e 944 return(0);
984263bc 945 }
81b5c339 946 KKASSERT(dev->si_disk == dp);
984263bc 947
6f76c57e
HP
948 /*
949 * The dscheck() function will also transform the slice relative
54078292 950 * block number i.e. bio->bio_offset into a block number that can be
9a71d53f
MD
951 * passed directly to the underlying raw device. If dscheck()
952 * returns NULL it will have handled the bio for us (e.g. EOF
953 * or error due to being beyond the device size).
6f76c57e 954 */
cd29885a 955 if ((nbio = dscheck(dev, bio, dp->d_slice)) != NULL) {
8c72e3d5 956 dsched_queue(dp, nbio);
cd29885a 957 } else {
81b5c339 958 biodone(bio);
cd29885a 959 }
fef8985e 960 return(0);
984263bc
MD
961}
962
335dda38 963/*
fef8985e 964 * Return the partition size in ?blocks?
335dda38 965 */
fbda7fa6
MD
966static
967int
fef8985e 968diskpsize(struct dev_psize_args *ap)
984263bc 969{
b13267a5 970 cdev_t dev = ap->a_head.a_dev;
984263bc 971 struct disk *dp;
984263bc 972
e4c9c0c8
MD
973 dp = dev->si_disk;
974 if (dp == NULL)
fef8985e
MD
975 return(ENODEV);
976 ap->a_result = dssize(dev, &dp->d_slice);
977 return(0);
984263bc
MD
978}
979
e4c9c0c8 980/*
fef8985e
MD
981 * When new device entries are instantiated, make sure they inherit our
982 * si_disk structure and block and iosize limits from the raw device.
e4c9c0c8 983 *
d7d5e114 984 * This routine is always called synchronously in the context of the
fef8985e
MD
985 * client.
986 *
987 * XXX The various io and block size constraints are not always initialized
988 * properly by devices.
e4c9c0c8 989 */
fbda7fa6
MD
990static
991int
fef8985e 992diskclone(struct dev_clone_args *ap)
984263bc 993{
b13267a5 994 cdev_t dev = ap->a_head.a_dev;
984263bc 995 struct disk *dp;
aec8eea4
MD
996 dp = dev->si_disk;
997
fef8985e
MD
998 KKASSERT(dp != NULL);
999 dev->si_disk = dp;
1000 dev->si_iosize_max = dp->d_rawdev->si_iosize_max;
1001 dev->si_bsize_phys = dp->d_rawdev->si_bsize_phys;
1002 dev->si_bsize_best = dp->d_rawdev->si_bsize_best;
1003 return(0);
1004}
1005
1006int
1007diskdump(struct dev_dump_args *ap)
1008{
b13267a5 1009 cdev_t dev = ap->a_head.a_dev;
aec8eea4 1010 struct disk *dp = dev->si_disk;
b24cd69c 1011 u_int64_t size, offset;
fef8985e
MD
1012 int error;
1013
b24cd69c
AH
1014 error = disk_dumpcheck(dev, &size, &ap->a_blkno, &ap->a_secsize);
1015 /* XXX: this should probably go in disk_dumpcheck somehow */
1016 if (ap->a_length != 0) {
1017 size *= DEV_BSIZE;
1018 offset = ap->a_blkno * DEV_BSIZE;
1019 if ((ap->a_offset < offset) ||
1020 (ap->a_offset + ap->a_length - offset > size)) {
1021 kprintf("Attempt to write outside dump device boundaries.\n");
1022 error = ENOSPC;
1023 }
1024 }
1025
fef8985e
MD
1026 if (error == 0) {
1027 ap->a_head.a_dev = dp->d_rawdev;
1028 error = dev_doperate(&ap->a_head);
984263bc 1029 }
fef8985e
MD
1030
1031 return(error);
984263bc
MD
1032}
1033
fef8985e 1034
d7d5e114 1035SYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD,
984263bc
MD
1036 0, sizeof(struct diskslices), "sizeof(struct diskslices)");
1037
d7d5e114 1038SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD,
984263bc 1039 0, sizeof(struct disk), "sizeof(struct disk)");
7a9e53ad 1040
ef548879 1041/*
4afeea0d
MD
1042 * Reorder interval for burst write allowance and minor write
1043 * allowance.
1044 *
1045 * We always want to trickle some writes in to make use of the
1046 * disk's zone cache. Bursting occurs on a longer interval and only
1047 * runningbufspace is well over the hirunningspace limit.
ef548879 1048 */
4afeea0d
MD
1049int bioq_reorder_burst_interval = 60; /* should be multiple of minor */
1050SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_burst_interval,
1051 CTLFLAG_RW, &bioq_reorder_burst_interval, 0, "");
1052int bioq_reorder_minor_interval = 5;
1053SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_minor_interval,
1054 CTLFLAG_RW, &bioq_reorder_minor_interval, 0, "");
1055
1056int bioq_reorder_burst_bytes = 3000000;
1057SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_burst_bytes,
1058 CTLFLAG_RW, &bioq_reorder_burst_bytes, 0, "");
1059int bioq_reorder_minor_bytes = 262144;
1060SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_minor_bytes,
1061 CTLFLAG_RW, &bioq_reorder_minor_bytes, 0, "");
ef548879 1062
7a9e53ad
MD
1063
1064/*
30e5862e
MD
1065 * Order I/Os. Generally speaking this code is designed to make better
1066 * use of drive zone caches. A drive zone cache can typically track linear
1067 * reads or writes for around 16 zones simultaniously.
7a9e53ad 1068 *
30e5862e
MD
1069 * Read prioritization issues: It is possible for hundreds of megabytes worth
1070 * of writes to be queued asynchronously. This creates a huge bottleneck
1071 * for reads which reduce read bandwidth to a trickle.
7a9e53ad 1072 *
4afeea0d
MD
1073 * To solve this problem we generally reorder reads before writes.
1074 *
1075 * However, a large number of random reads can also starve writes and
1076 * make poor use of the drive zone cache so we allow writes to trickle
1077 * in every N reads.
7a9e53ad
MD
1078 */
1079void
81b5c339 1080bioqdisksort(struct bio_queue_head *bioq, struct bio *bio)
7a9e53ad 1081{
4afeea0d
MD
1082 /*
1083 * The BIO wants to be ordered. Adding to the tail also
1084 * causes transition to be set to NULL, forcing the ordering
1085 * of all prior I/O's.
1086 */
1087 if (bio->bio_buf->b_flags & B_ORDERED) {
1088 bioq_insert_tail(bioq, bio);
1089 return;
1090 }
1091
30e5862e
MD
1092 switch(bio->bio_buf->b_cmd) {
1093 case BUF_CMD_READ:
1094 if (bioq->transition) {
7a9e53ad 1095 /*
4afeea0d
MD
1096 * Insert before the first write. Bleedover writes
1097 * based on reorder intervals to prevent starvation.
7a9e53ad 1098 */
30e5862e 1099 TAILQ_INSERT_BEFORE(bioq->transition, bio, bio_act);
4afeea0d
MD
1100 ++bioq->reorder;
1101 if (bioq->reorder % bioq_reorder_minor_interval == 0) {
30e5862e 1102 bioqwritereorder(bioq);
4afeea0d
MD
1103 if (bioq->reorder >=
1104 bioq_reorder_burst_interval) {
1105 bioq->reorder = 0;
1106 }
7a9e53ad
MD
1107 }
1108 } else {
7a9e53ad 1109 /*
30e5862e
MD
1110 * No writes queued (or ordering was forced),
1111 * insert at tail.
7a9e53ad 1112 */
30e5862e 1113 TAILQ_INSERT_TAIL(&bioq->queue, bio, bio_act);
7a9e53ad 1114 }
30e5862e
MD
1115 break;
1116 case BUF_CMD_WRITE:
1117 /*
1118 * Writes are always appended. If no writes were previously
1119 * queued or an ordered tail insertion occured the transition
1120 * field will be NULL.
1121 */
1122 TAILQ_INSERT_TAIL(&bioq->queue, bio, bio_act);
1123 if (bioq->transition == NULL)
1124 bioq->transition = bio;
1125 break;
1126 default:
1127 /*
1128 * All other request types are forced to be ordered.
1129 */
1130 bioq_insert_tail(bioq, bio);
4afeea0d 1131 break;
7a9e53ad 1132 }
30e5862e 1133}
7a9e53ad 1134
30e5862e 1135/*
4afeea0d
MD
1136 * Move the read-write transition point to prevent reads from
1137 * completely starving our writes. This brings a number of writes into
30e5862e 1138 * the fold every N reads.
4afeea0d
MD
1139 *
1140 * We bring a few linear writes into the fold on a minor interval
1141 * and we bring a non-linear burst of writes into the fold on a major
1142 * interval. Bursting only occurs if runningbufspace is really high
1143 * (typically from syncs, fsyncs, or HAMMER flushes).
30e5862e
MD
1144 */
1145static
1146void
1147bioqwritereorder(struct bio_queue_head *bioq)
1148{
1149 struct bio *bio;
1150 off_t next_offset;
4afeea0d 1151 size_t left;
30e5862e 1152 size_t n;
4afeea0d
MD
1153 int check_off;
1154
1155 if (bioq->reorder < bioq_reorder_burst_interval ||
1156 !buf_runningbufspace_severe()) {
1157 left = (size_t)bioq_reorder_minor_bytes;
1158 check_off = 1;
1159 } else {
1160 left = (size_t)bioq_reorder_burst_bytes;
1161 check_off = 0;
1162 }
30e5862e
MD
1163
1164 next_offset = bioq->transition->bio_offset;
1165 while ((bio = bioq->transition) != NULL &&
4afeea0d
MD
1166 (check_off == 0 || next_offset == bio->bio_offset)
1167 ) {
30e5862e
MD
1168 n = bio->bio_buf->b_bcount;
1169 next_offset = bio->bio_offset + n;
1170 bioq->transition = TAILQ_NEXT(bio, bio_act);
1171 if (left < n)
7a9e53ad 1172 break;
30e5862e 1173 left -= n;
7a9e53ad 1174 }
7a9e53ad
MD
1175}
1176
7a9e53ad 1177/*
c6ef65ea
AH
1178 * Bounds checking against the media size, used for the raw partition.
1179 * secsize, mediasize and b_blkno must all be the same units.
1180 * Possibly this has to be DEV_BSIZE (512).
1181 */
1182int
1183bounds_check_with_mediasize(struct bio *bio, int secsize, uint64_t mediasize)
1184{
1185 struct buf *bp = bio->bio_buf;
1186 int64_t sz;
1187
1188 sz = howmany(bp->b_bcount, secsize);
1189
1190 if (bio->bio_offset/DEV_BSIZE + sz > mediasize) {
1191 sz = mediasize - bio->bio_offset/DEV_BSIZE;
1192 if (sz == 0) {
1193 /* If exactly at end of disk, return EOF. */
1194 bp->b_resid = bp->b_bcount;
1195 return 0;
1196 }
1197 if (sz < 0) {
1198 /* If past end of disk, return EINVAL. */
1199 bp->b_error = EINVAL;
1200 return 0;
1201 }
1202 /* Otherwise, truncate request. */
1203 bp->b_bcount = sz * secsize;
1204 }
1205
1206 return 1;
1207}
1208
1209/*
7a9e53ad
MD
1210 * Disk error is the preface to plaintive error messages
1211 * about failing disk transfers. It prints messages of the form
1212
1213hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
1214
1215 * if the offset of the error in the transfer and a disk label
1216 * are both available. blkdone should be -1 if the position of the error
1217 * is unknown; the disklabel pointer may be null from drivers that have not
6ea70f76 1218 * been converted to use them. The message is printed with kprintf
7a9e53ad 1219 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
6ea70f76 1220 * The message should be completed (with at least a newline) with kprintf
a0a36cfd 1221 * or log(-1, ...), respectively. There is no trailing space.
7a9e53ad
MD
1222 */
1223void
a688b15c 1224diskerr(struct bio *bio, cdev_t dev, const char *what, int pri, int donecnt)
7a9e53ad 1225{
81b5c339 1226 struct buf *bp = bio->bio_buf;
c6f49b01
MD
1227 const char *term;
1228
1229 switch(bp->b_cmd) {
1230 case BUF_CMD_READ:
1231 term = "read";
1232 break;
1233 case BUF_CMD_WRITE:
1234 term = "write";
1235 break;
1236 default:
1237 term = "access";
1238 break;
1239 }
cd29885a 1240 kprintf("%s: %s %sing ", dev->si_name, what, term);
973c11b9
MD
1241 kprintf("offset %012llx for %d",
1242 (long long)bio->bio_offset,
1243 bp->b_bcount);
cd29885a 1244
54078292 1245 if (donecnt)
6ea70f76 1246 kprintf(" (%d bytes completed)", donecnt);
7a9e53ad 1247}
81b5c339 1248
a8873631
MD
1249/*
1250 * Locate a disk device
1251 */
1252cdev_t
1253disk_locate(const char *devname)
1254{
cd29885a
MD
1255 return devfs_find_device_by_name(devname);
1256}
a8873631 1257
cd29885a
MD
1258void
1259disk_config(void *arg)
1260{
aec8eea4 1261 disk_msg_send_sync(DISK_SYNC, NULL, NULL);
cd29885a
MD
1262}
1263
cd29885a
MD
1264static void
1265disk_init(void)
1266{
1267 struct thread* td_core;
cd29885a 1268
149e86b9
MD
1269 disk_msg_cache = objcache_create("disk-msg-cache", 0, 0,
1270 NULL, NULL, NULL,
1271 objcache_malloc_alloc,
1272 objcache_malloc_free,
1273 &disk_msg_malloc_args);
cd29885a 1274
b37f18d6 1275 lwkt_token_init(&disklist_token, 1, "disks");
be755ff9 1276
149e86b9
MD
1277 /*
1278 * Initialize the reply-only port which acts as a message drain
1279 */
cd29885a
MD
1280 lwkt_initport_replyonly(&disk_dispose_port, disk_msg_autofree_reply);
1281
c9e9fb21 1282 lwkt_gettoken(&disklist_token);
cd29885a 1283 lwkt_create(disk_msg_core, /*args*/NULL, &td_core, NULL,
c9e9fb21 1284 TDF_MPSAFE, 0, "disk_msg_core");
cd29885a 1285 tsleep(td_core, 0, "diskcore", 0);
c9e9fb21 1286 lwkt_reltoken(&disklist_token);
cd29885a
MD
1287}
1288
cd29885a
MD
1289static void
1290disk_uninit(void)
1291{
cd29885a 1292 objcache_destroy(disk_msg_cache);
a8873631
MD
1293}
1294
fbbbca99
MD
/*
 * Clean out illegal characters in serial numbers.
 *
 * The NUL-terminated string is rewritten in place: ASCII letters,
 * digits and the characters '-', '@', '+' and '.' are kept, every
 * other character is replaced by '_'.
 */
static void
disk_cleanserial(char *serno)
{
	char *p;
	char ch;
	int legal;

	for (p = serno; *p != 0; ++p) {
		ch = *p;
		legal = (ch >= 'a' && ch <= 'z') ||
			(ch >= 'A' && ch <= 'Z') ||
			(ch >= '0' && ch <= '9') ||
			ch == '-' || ch == '@' || ch == '+' || ch == '.';
		if (!legal)
			*p = '_';
	}
}
1317
8c05caab
AH
1318TUNABLE_INT("kern.disk_debug", &disk_debug_enable);
1319SYSCTL_INT(_kern, OID_AUTO, disk_debug, CTLFLAG_RW, &disk_debug_enable,
1320 0, "Enable subr_disk debugging");
1321
cd29885a
MD
1322SYSINIT(disk_register, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, disk_init, NULL);
1323SYSUNINIT(disk_register, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, disk_uninit, NULL);