2 * Copyright (c) 1997, 1998, 1999
3 * Nan Yang Computer Services Limited. All rights reserved.
5 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
7 * Written by Greg Lehey
9 * This software is distributed under the so-called ``Berkeley
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by Nan Yang Computer
24 * 4. Neither the name of the Company nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
28 * This software is provided ``as is'', and any express or implied
29 * warranties, including, but not limited to, the implied warranties of
30 * merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall the company or contributors be liable for any
32 * direct, indirect, incidental, special, exemplary, or consequential
33 * damages (including, but not limited to, procurement of substitute
34 * goods or services; loss of use, data, or profits; or business
35 * interruption) however caused and on any theory of liability, whether
36 * in contract, strict liability, or tort (including negligence or
37 * otherwise) arising in any way out of the use of this software, even if
38 * advised of the possibility of such damage.
41 #include "opt_vinum.h"
44 #include "vinumstate.h"
47 * A disk block number or offset
typedef int64_t		vinum_off_t;	/* disk block number or offset (signed, 64 bits) */
52 * Some configuration maxima. They're an enum because
53 * we can't define global constants. Sorry about that.
55 * These aren't as bad as they look: most of them are soft limits.
60 VINUM_HEADER = 512, /* size of header on disk */
61 MAXCONFIGLINE = 1024, /* maximum size of one config line */
62 MINVINUMSLICE = 1048576, /* minimum size of a slice */
64 ROUND_ROBIN_READPOL = -1, /* round robin read policy */
67 * type field in minor number
69 VINUM_VOLUME_TYPE = 0,
73 VINUM_SUPERDEV_TYPE = 4, /* super device. */
74 VINUM_RAWPLEX_TYPE = 5, /* anonymous plex */
75 VINUM_RAWSD_TYPE = 6, /* anonymous subdisk */
78 * Shifts for the individual fields in the device
80 VINUM_TYPE_SHIFT = 28,
82 VINUM_PLEX_SHIFT = 16,
89 * Shifts for the second half of raw plex and
92 VINUM_RAWPLEX_SHIFT = 8, /* shift the second half this much */
93 VINUM_RAWPLEX_WIDTH = 12, /* width of second half */
97 MAXPLEX = 8, /* max number of plexes in a volume */
98 MAXSD = 256, /* max number of subdisks in a plex */
99 MAXDRIVENAME = 32, /* max length of a device name */
100 MAXSDNAME = 64, /* max length of a subdisk name */
101 MAXPLEXNAME = 64, /* max length of a plex name */
102 MAXVOLNAME = 64, /* max length of a volume name */
103 MAXNAME = 64, /* max length of any name */
107 * Define a minor device number.
108 * This is not used directly; instead, it's
109 * called by the other macros.
/*
 * Build a minor number from volume number v, plex number p, subdisk
 * number s and object type t.  Each argument is shifted to its field
 * position (VINUM_VOL_SHIFT, VINUM_PLEX_SHIFT, VINUM_SD_SHIFT and
 * VINUM_TYPE_SHIFT respectively) and the four results are OR'ed together.
 * No masking is applied, so each value must already fit its field.
 *
 * Every argument is parenthesized so that an expression containing an
 * operator of lower precedence than << (for example a | b) is shifted
 * as a whole.
 */
#define VINUMMINOR(v,p,s,t)	( ((v) << VINUM_VOL_SHIFT)		\
				| ((p) << VINUM_PLEX_SHIFT)		\
				| ((s) << VINUM_SD_SHIFT)		\
				| ((t) << VINUM_TYPE_SHIFT) )
116 /* Create device minor numbers */
/*
 * Device number for volume v, plex p, subdisk s and object type t.
 * This is a plain alias: all four arguments are handed unchanged to
 * VINUMMINOR.
 */
#define VINUMDEV(v,p,s,t)	VINUMMINOR(v, p, s, t)
123 #define VINUM_PLEX(p) \
124 ((VINUM_RAWPLEX_TYPE << VINUM_TYPE_SHIFT) \
126 | ((p & ~0xff) << 8))
128 #define VINUM_SD(s) \
129 ((VINUM_RAWSD_TYPE << VINUM_TYPE_SHIFT) \
131 | ((s & ~0xff) << 8))
/*
 * Bit mask with the low x bits set, e.g. MASK(8) is 0xff.  The result
 * has type int, so x must be smaller than the number of value bits in
 * an int.
 */
#define MASK(x)		(-1 + (1 << (x)))
/*
 * Create a raw block device minor number for object number d of type t.
 *
 * The low VINUM_VOL_WIDTH bits of d are placed at VINUM_VOL_SHIFT, the
 * remaining high bits of d are shifted left by
 * (VINUM_PLEX_SHIFT + VINUM_VOL_WIDTH), and t is placed at
 * VINUM_TYPE_SHIFT.  d is evaluated twice, so it must not have side
 * effects.
 *
 * d and t are parenthesized so that arguments containing operators of
 * lower precedence than & or << (for example a | b) are used as a whole.
 *
 * NOTE(review): the high bits of d are not shifted down before being
 * shifted up, so they land VINUM_PLEX_SHIFT + VINUM_VOL_WIDTH above
 * their original position -- confirm that this is the intended layout
 * (compare VINUM_RAWPLEX_SHIFT).
 */
#define VINUMRMINOR(d,t)						\
	( (((d) & MASK(VINUM_VOL_WIDTH)) << VINUM_VOL_SHIFT)		\
	| (((d) & ~MASK(VINUM_VOL_WIDTH)) <<				\
	    (VINUM_PLEX_SHIFT + VINUM_VOL_WIDTH))			\
	| ((t) << VINUM_TYPE_SHIFT) )
/*
 * Extract the object type of device x: take minor(x), move the field
 * that starts at VINUM_TYPE_SHIFT down to bit 0 and keep its 3 bits.
 */
#define DEVTYPE(x)	(0x7 & (minor(x) >> VINUM_TYPE_SHIFT))
149 * This mess is used to catch people who compile
150 * a debug vinum(8) and non-debug kernel module,
151 * or the other way round.
156 /* superdevice number */
157 #define VINUM_SUPERDEV VINUMMINOR(1, 0, 0, VINUM_SUPERDEV_TYPE)
159 /* non-debug superdevice number */
160 #define VINUM_WRONGSUPERDEV VINUMMINOR(2, 0, 0, VINUM_SUPERDEV_TYPE)
164 /* superdevice number */
165 #define VINUM_SUPERDEV VINUMMINOR(2, 0, 0, VINUM_SUPERDEV_TYPE)
167 /* debug superdevice number */
168 #define VINUM_WRONGSUPERDEV VINUMMINOR(1, 0, 0, VINUM_SUPERDEV_TYPE)
172 /* daemon superdevice number */
173 #define VINUM_DAEMON_DEV VINUMMINOR(0, 0, 0, VINUM_SUPERDEV_TYPE)
176 * the number of object entries to cater for initially, and also the
177 * value by which they are incremented. It doesn't take long
178 * to extend them, so theoretically we could start with 1 of each, but
179 * it's untidy to allocate such small areas. These values are
180 * probably too small.
186 INITIAL_SUBDISKS = 16,
187 INITIAL_SUBDISKS_IN_PLEX = 4, /* num subdisks to alloc to a plex */
188 INITIAL_SUBDISKS_IN_DRIVE = 4, /* num subdisks to alloc to a drive */
189 INITIAL_DRIVE_FREELIST = 16, /* num entries in drive freelist */
190 PLEX_REGION_TABLE_SIZE = 8, /* num entries in plex region tables */
191 PLEX_LOCKS = 256, /* num locks to alloc to a plex */
192 MAX_REVIVE_BLOCKSIZE = MAXPHYS, /* maximum revive block size */
193 DEFAULT_REVIVE_BLOCKSIZE = 65536,/* default revive block size */
194 VINUMHOSTNAMELEN = 32, /* host name field in label */
200 * 31 30 28 27 20 19 18 16 15 8 7 0
201 * |---------------------------------------------------------------------|
202 * |X | Type | Subdisk number | X| Plex | Major | volno |
203 * |---------------------------------------------------------------------|
207 * The fields in the minor number are interpreted as follows:
209 * Volume: Only type and volume number are relevant
210 * Plex in volume: type, plex number in volume and volume number
212 * raw plex: type, plex number is made of bits 27-16 and 7-0
213 * raw subdisk: type, subdisk number is made of bits 27-16 and 7-0
218 /* This doesn't get used. Consider removing it. */
221 * CARE. These fields assume a big-endian word. On a
222 * little-endian system, they're the wrong way around
224 unsigned volume:8; /* up to 256 volumes */
225 unsigned major:8; /* major number fits */
226 unsigned plex:3; /* up to 8 plexes per volume */
227 unsigned unused:1; /* up for grabs */
228 unsigned sd:8; /* up to 256 subdisks per plex */
229 unsigned type:3; /* type of object */
239 unsigned signbit:1; /* to make 32 bits */
244 #define VINUM_BASE "vinum/"
245 #define VINUM_DIR "/dev/vinum"
248 * These definitions help catch
249 * userland/kernel mismatches.
253 /* normal super device */
254 #define VINUM_WRONGSUPERDEV_NAME VINUM_DIR "/control"
255 #define VINUM_WRONGSUPERDEV_BASE VINUM_BASE "control"
257 /* debug super device */
258 #define VINUM_SUPERDEV_NAME VINUM_DIR "/Control"
259 #define VINUM_SUPERDEV_BASE VINUM_BASE "Control"
263 /* debug super device */
264 #define VINUM_WRONGSUPERDEV_NAME VINUM_DIR "/Control"
265 #define VINUM_WRONGSUPERDEV_BASE VINUM_BASE "Control"
267 /* normal super device */
268 #define VINUM_SUPERDEV_NAME VINUM_DIR "/control"
269 #define VINUM_SUPERDEV_BASE VINUM_BASE "control"
273 /* super device for daemon only */
274 #define VINUM_DAEMON_DEV_NAME VINUM_DIR "/controld"
275 #define VINUM_DAEMON_DEV_BASE VINUM_BASE "controld"
278 * Flags for all objects. Most of them only apply to
279 * specific objects, but we have space for all in any
283 VF_LOCKED = 1, /* locked access to this object */
284 VF_LOCKING = 2, /* we want access to this object */
285 VF_OPEN = 4, /* object has openers */
286 VF_WRITETHROUGH = 8, /* volume: write through */
287 VF_INITED = 0x10, /* unit has been initialized */
289 /* 0x20 unused, was: VF_WLABEL: label area is writable */
290 VF_LABELLING = 0x40, /* unit is currently being labelled */
291 VF_WANTED = 0x80, /* waiting to obtain a lock */
292 VF_RAW = 0x100, /* raw volume (no file system) */
293 VF_LOADED = 0x200, /* module is loaded */
294 VF_CONFIGURING = 0x400, /* someone is changing the config */
295 VF_WILL_CONFIGURE = 0x800, /* someone wants to change the config */
296 VF_CONFIG_INCOMPLETE = 0x1000, /* not finished changing the config */
297 VF_CONFIG_SETUPSTATE = 0x2000, /* set a vol up if all plexes empty */
298 VF_READING_CONFIG = 0x4000, /* reading config database from disk */
299 VF_FORCECONFIG = 0x8000, /* config drives even with diff names */
300 VF_NEWBORN = 0x10000, /* for objects: we've just created it */
301 VF_CONFIGURED = 0x20000, /* for drives: we read the config */
302 VF_STOPPING = 0x40000, /* for vinum_conf: stop on last close */
304 VF_DAEMONOPEN = 0x80000, /* the daemon has us open (only
307 VF_CREATED = 0x100000, /* for vols: freshly created,
309 VF_HOTSPARE = 0x200000, /* for drives: use as hot spare */
310 VF_RETRYERRORS = 0x400000, /* don't down subdisks on I/O errors */
313 /* Global configuration information for the vinum subsystem */
315 /* Pointers to vinum structures */
319 struct volume *volume;
321 /* the number allocated */
322 int drives_allocated;
323 int subdisks_allocated;
324 int plexes_allocated;
325 int volumes_allocated;
327 /* and the number currently in use */
335 #define VINUM_MAXACTIVE 30000 /* max number of active requests */
336 int active; /* current number of requests outstanding */
337 int maxactive; /* max number of requests ever outstanding */
339 struct request *lastrq;
/*
 * Abbreviations for members of the global vinum_conf.  Each one is a
 * plain textual alias, so it can be used wherever the member itself
 * could be (read, assigned to, indexed, address taken).
 */
#define DRIVE	vinum_conf.drive	/* the drive member */
#define SD	vinum_conf.sd		/* the sd (subdisk) member */
#define PLEX	vinum_conf.plex		/* the plex member */
#define VOL	vinum_conf.volume	/* the volume member */
#define VFLAGS	vinum_conf.flags	/* the flags member */
355 * Vinum drives start with this structure:
358 * |--------------------------------------|
359 * | PDP-11 memorial boot block | 0
360 * |--------------------------------------|
361 * | Disk label, maybe | 1
362 * |--------------------------------------|
363 * | Slice definition (vinum_hdr) | 8
364 * |--------------------------------------|
366 * | Configuration info, first copy | 9
368 * |--------------------------------------|
370 * | Configuration info, second copy | 9 + size of config
372 * |--------------------------------------|
376 * Sizes and offsets of our information
379 VINUM_LABEL_OFFSET = 4096, /* offset of vinum label */
380 VINUMHEADERLEN = 512, /* size of vinum label */
381 VINUM_CONFIG_OFFSET = 4608, /* offset of first config copy */
382 MAXCONFIG = 65536, /* and size of config copy */
384 /* this is where the data starts */
385 DATASTART = (MAXCONFIG * 2 + VINUM_CONFIG_OFFSET) / DEV_BSIZE
389 * hostname is 256 bytes long, but we don't need to shlep
390 * multiple copies in vinum. We use the host name just
391 * to identify this system, and 32 bytes should be ample
396 char sysname[VINUMHOSTNAMELEN]; /* system name at time of creation */
397 char name[MAXDRIVENAME]; /* our name of the drive */
398 struct timeval date_of_birth; /* the time it was created */
399 struct timeval last_update; /* and the time of last update */
401 * total size in bytes of the drive. This value
402 * includes the headers.
408 uint64_t magic; /* we're long on magic numbers */
411 * Size in bytes of each copy of the
412 * configuration info. This must be a multiple
413 * of the sector size.
416 struct vinum_label label; /* unique label */
420 #define VINUM_MAGIC 22322600044678729LL
422 /* becomes this after obliteration */
423 #define VINUM_NOMAGIC 22322600044678990LL
425 /* Information returned from read_drive_label */
426 enum drive_label_info {
427 DL_CANT_OPEN, /* invalid partition */
428 DL_NOT_OURS, /* valid part, but no vinum label */
429 DL_DELETED_LABEL, /* valid part, deleted label found */
430 DL_WRONG_DRIVE, /* drive name doesn't match */
431 DL_OURS /* valid partition and label found */
434 /*** Drive definitions ***/
436 * A drive corresponds to a disk slice. We use a different term to show
437 * the difference in usage: it doesn't have to be a slice, and could
438 * theoretically be a complete, unpartitioned disk
442 char devicename[MAXDRIVENAME]; /* name of the slice it's on */
443 enum drivestate state; /* current state */
444 int flags; /* flags */
445 int subdisks_allocated; /* number of entries in sd */
446 int subdisks_used; /* and the number used */
447 int blocksize; /* size of fs blocks */
448 int pid; /* of locker */
449 u_int64_t sectors_available; /* number of sectors still available */
451 int lasterror; /* last error on drive */
452 int driveno; /* index of drive in vinum_conf */
453 int opencount; /* number of up subdisks */
454 u_int64_t reads; /* number of reads on this drive */
455 u_int64_t writes; /* number of writes on this drive */
456 u_int64_t bytes_read; /* number of bytes read */
457 u_int64_t bytes_written; /* number of bytes written */
458 struct vinum_label label; /* and the label information */
459 #define DRIVE_MAXACTIVE 30000 /* maximum number of active requests */
460 int active; /* current number of reqs outstanding */
461 int maxactive; /* max num of reqs ever outstanding */
462 int freelist_size; /* entries alloced in free list */
463 int freelist_entries; /* entries used in free list */
464 struct drive_freelist { /* sorted list of free space on drive */
465 u_int64_t offset; /* offset of entry */
466 u_int64_t sectors; /* and length in sectors */
468 struct partinfo partinfo; /* partition information */
469 /* XXX kludge until we get this struct cleaned up */
478 char lockfilename[16]; /* locked with file */
479 int lockline; /* and the line number */
483 /*** Subdisk definitions ***/
486 char name[MAXSDNAME]; /* name of subdisk */
487 enum sdstate state; /* state */
489 int lasterror; /* last error occurred */
490 /* offsets in blocks */
491 int64_t driveoffset; /* offset on drive */
499 * plexoffset is the offset from the beginning
500 * of the plex to the very first part of the
501 * subdisk, in sectors. For striped, RAID-4 and
502 * RAID-5 plexes, only the first stripe is
503 * located at this offset
505 int64_t plexoffset; /* offset in plex */
506 u_int64_t sectors; /* and length in sectors */
507 int plexno; /* index of plex, if it belongs */
508 int driveno; /* index of the drive */
509 int sdno; /* our index in vinum_conf */
510 int plexsdno; /* and our number in our plex */
511 /* (undefined if no plex) */
512 u_int64_t reads; /* number of reads on this subdisk */
513 u_int64_t writes; /* number of writes on this subdisk */
514 u_int64_t bytes_read; /* number of bytes read */
515 u_int64_t bytes_written; /* number of bytes written */
516 /* revive parameters */
517 u_int64_t revived; /* blkno of current revive request */
518 int revive_blocksize; /* revive block size (bytes) */
519 int revive_interval; /* and time to wait between transfers */
520 pid_t reviver; /* PID of reviving process */
521 /* init parameters */
522 u_int64_t initialized; /* blkno of current init request */
523 int init_blocksize; /* init block size (bytes) */
524 int init_interval; /* time to wait between transfers */
525 struct request *waitlist; /* list of reqs waiting on revive op */
528 /*** Plex definitions ***/
530 /* kinds of plex organization */
532 plex_disorg, /* disorganized */
533 plex_concat, /* concatenated plex */
534 plex_striped, /* striped plex */
535 plex_raid4, /* RAID4 plex */
536 plex_raid5 /* RAID5 plex */
539 /* Recognize plex organizations */
/*
 * True if the plex pointed to by p has organization plex_striped or any
 * later enumerator; the test is a >= comparison, so it relies on the
 * order of the plex organization enumerators.  p is parenthesized so
 * that pointer expressions such as &plex can be passed.
 */
#define isstriped(p)	((p)->organization >= plex_striped)
/*
 * True if the plex pointed to by p has organization plex_raid4 or any
 * later enumerator; the test is a >= comparison, so it relies on the
 * order of the plex organization enumerators.  p is parenthesized so
 * that pointer expressions such as &plex can be passed.
 */
#define isparity(p)	((p)->organization >= plex_raid4)
547 char name[MAXPLEXNAME]; /* name of plex */
548 enum plexorg organization; /* Plex organization */
549 enum plexstate state; /* and current state */
553 void *plex_dev_dummy;
555 u_int64_t length; /* total length of plex (sectors) */
557 int stripesize; /* size of stripe or raid band,
559 int subdisks; /* number of associated subdisks */
560 int subdisks_allocated; /* number of subdisks allocated
562 int *sdnos; /* list of component subdisks */
563 int plexno; /* index of plex in vinum_conf */
564 int volno; /* index of volume */
565 int volplexno; /* number of plex in volume */
567 u_int64_t reads; /* number of reads on this plex */
568 u_int64_t writes; /* number of writes on this plex */
569 u_int64_t bytes_read; /* number of bytes read */
570 u_int64_t bytes_written; /* number of bytes written */
571 u_int64_t recovered_reads; /* number of recovered read
573 u_int64_t degraded_writes; /* number of degraded writes */
574 u_int64_t parityless_writes; /* number of parityless writes */
575 u_int64_t multiblock; /* requests that needed more than
577 u_int64_t multistripe; /* requests that needed more than
579 int sddowncount; /* number of subdisks down */
581 /* Lock information */
582 int usedlocks; /* number currently in use */
583 int lockwaits; /* and number of waits for locks */
584 off_t checkblock; /* block number for parity op */
585 struct rangelock *lock; /* ranges of locked addresses */
588 /*** Volume definitions ***/
590 /* Address range definitions, for locking volumes */
592 vinum_off_t stripe; /* address + 1 of the range being locked */
593 struct buf *bp; /* user's buffer pointer */
597 char name[MAXVOLNAME]; /* name of volume */
598 enum volumestate state; /* current state */
599 int plexes; /* number of plexes */
600 int preferred_plex; /* plex to read from, -1 for
609 * index of plex used for last read, for
613 int volno; /* volume number */
614 int flags; /* status and configuration flags */
615 int openflags; /* flags supplied to last open(2) */
616 u_int64_t size; /* size of volume */
617 int blocksize; /* logical block size */
618 int active; /* number of outstanding
620 int subops; /* and the number of suboperations */
622 u_int64_t bytes_read; /* number of bytes read */
623 u_int64_t bytes_written; /* number of bytes written */
624 u_int64_t reads; /* number of reads on this volume */
625 u_int64_t writes; /* number of writes on this volume */
626 u_int64_t recovered_reads; /* reads recovered from another plex */
629 * Unlike subdisks in the plex, space for the
630 * plex pointers is static.
632 int plex[MAXPLEX]; /* index of plexes */
636 * Table expansion. Expand table, which contains oldcount
637 * entries of type element, by increment entries, and change
638 * oldcount accordingly
640 #define EXPAND(table, element, oldcount, increment) \
642 expand_table((void **) &table, \
643 oldcount * sizeof (element), \
644 (oldcount + increment) * sizeof (element)); \
645 oldcount += increment; \
649 * Information on vinum's memory usage
652 int mallocs; /* number of malloced blocks */
653 int total_malloced; /* total amount malloced */
654 int highwater; /* maximum number of mallocs */
655 struct mc *malloced; /* pointer to kernel table */
658 #define MCFILENAMELEN 16
665 char file[MCFILENAMELEN];
669 * These enums are used by the state transition
670 * routines. They're in bit map format:
672 * Bit 0: Other plexes in the volume are down
673 * Bit 1: Other plexes in the volume are up
674 * Bit 2: The current plex is up
675 * Maybe they should be local to
679 volplex_onlyusdown = 0, /* 0: we're the only plex,
681 volplex_alldown, /* 1: another plex is down,
683 volplex_otherup, /* 2: another plex is up */
684 volplex_otherupdown, /* 3: other plexes are up and down */
685 volplex_onlyus, /* 4: we're up and alone */
686 volplex_onlyusup, /* 5: only we are up, others are down */
687 volplex_allup, /* 6: all plexes are up */
688 volplex_someup /* 7: some plexes are up,
692 /* state map for plex */
695 sd_downstate = 2, /* SD is down */
696 sd_crashedstate = 4, /* SD is crashed */
697 sd_obsoletestate = 8, /* SD is obsolete */
698 sd_stalestate = 16, /* SD is stale */
699 sd_rebornstate = 32, /* SD is reborn */
700 sd_upstate = 64, /* SD is up */
701 sd_initstate = 128, /* SD is initializing */
702 sd_initializedstate = 256, /* SD is initialized */
703 sd_otherstate = 512, /* SD is in some other state */
707 * This is really just a parameter to pass to
708 * set_<foo>_state, but since it needs to be known
709 * in the external definitions, we need to define
713 setstate_none = 0, /* no flags */
714 setstate_force = 1, /* force the state change */
715 setstate_configuring = 2, /* we're currently configuring,
719 /* Operations for parityops to perform. */
723 rebuildandcheckparity, /* rebuildparity with the -v option */
732 DEBUG_ADDRESSES = 1, /* show buffer information during
734 DEBUG_NUMOUTPUT = 2, /* show the value of vp->v_numoutput */
735 DEBUG_RESID = 4, /* go into debugger in complete_rqe */
736 DEBUG_LASTREQS = 8, /* keep a circular buffer of
738 DEBUG_REVIVECONFLICT = 16, /* print info about revive conflicts */
739 DEBUG_EOFINFO = 32, /* print info about EOF detection */
740 DEBUG_MEMFREE = 64, /* keep info about Frees */
741 DEBUG_BIGDRIVE = 128, /* pretend our drives are 100 times
743 DEBUG_REMOTEGDB = 256, /* go into remote gdb */
744 DEBUG_WARNINGS = 512, /* log various relatively
745 * harmless warnings */
750 #define longjmp LongJmp /* test our longjmps */