Commit | Line | Data |
---|---|---|
984263bc MD |
1 | /*- |
2 | * Copyright (c) 1997, 1998, 1999 | |
3 | * Nan Yang Computer Services Limited. All rights reserved. | |
4 | * | |
5 | * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. | |
6 | * | |
7 | * Written by Greg Lehey | |
8 | * | |
9 | * This software is distributed under the so-called ``Berkeley | |
10 | * License'': | |
11 | * | |
12 | * Redistribution and use in source and binary forms, with or without | |
13 | * modification, are permitted provided that the following conditions | |
14 | * are met: | |
15 | * 1. Redistributions of source code must retain the above copyright | |
16 | * notice, this list of conditions and the following disclaimer. | |
17 | * 2. Redistributions in binary form must reproduce the above copyright | |
18 | * notice, this list of conditions and the following disclaimer in the | |
19 | * documentation and/or other materials provided with the distribution. | |
20 | * 3. All advertising materials mentioning features or use of this software | |
21 | * must display the following acknowledgement: | |
22 | * This product includes software developed by Nan Yang Computer | |
23 | * Services Limited. | |
24 | * 4. Neither the name of the Company nor the names of its contributors | |
25 | * may be used to endorse or promote products derived from this software | |
26 | * without specific prior written permission. | |
27 | * | |
28 | * This software is provided ``as is'', and any express or implied | |
29 | * warranties, including, but not limited to, the implied warranties of | |
30 | * merchantability and fitness for a particular purpose are disclaimed. | |
31 | * In no event shall the company or contributors be liable for any | |
32 | * direct, indirect, incidental, special, exemplary, or consequential | |
33 | * damages (including, but not limited to, procurement of substitute | |
34 | * goods or services; loss of use, data, or profits; or business | |
35 | * interruption) however caused and on any theory of liability, whether | |
36 | * in contract, strict liability, or tort (including negligence or | |
37 | * otherwise) arising in any way out of the use of this software, even if | |
38 | * advised of the possibility of such damage. | |
984263bc MD |
39 | */ |
40 | ||
29bcf169 | 41 | #ifdef _KERNEL |
752b2d38 | 42 | #include "opt_vinum.h" |
29bcf169 | 43 | #endif |
752b2d38 | 44 | |
984263bc | 45 | #include <sys/time.h> |
1f2de5d4 | 46 | #include "vinumstate.h" |
984263bc | 47 | |
c2fcd54e MD |
/*
 * vinum_off_t: the signed 64-bit type vinum uses to hold a disk
 * block number or an offset.
 */
typedef int64_t vinum_off_t;
52 | ||
984263bc MD |
53 | /* |
54 | * Some configuration maxima. They're an enum because | |
55 | * we can't define global constants. Sorry about that. | |
56 | * | |
57 | * These aren't as bad as they look: most of them are soft limits. | |
58 | */ | |
59 | ||
60 | #define VINUMROOT | |
61 | enum constants { | |
7c8f38d4 MD |
62 | VINUM_HEADER = 512, /* size of header on disk */ |
63 | MAXCONFIGLINE = 1024, /* maximum size of one config line */ | |
64 | MINVINUMSLICE = 1048576, /* minimum size of a slice */ | |
65 | ||
7c8f38d4 MD |
66 | ROUND_ROBIN_READPOL = -1, /* round robin read policy */ |
67 | ||
68 | /* | |
69 | * type field in minor number | |
70 | */ | |
71 | VINUM_VOLUME_TYPE = 0, | |
72 | VINUM_PLEX_TYPE = 1, | |
73 | VINUM_SD_TYPE = 2, | |
74 | VINUM_DRIVE_TYPE = 3, | |
75 | VINUM_SUPERDEV_TYPE = 4, /* super device. */ | |
76 | VINUM_RAWPLEX_TYPE = 5, /* anonymous plex */ | |
77 | VINUM_RAWSD_TYPE = 6, /* anonymous subdisk */ | |
78 | ||
79 | /* | |
80 | * Shifts for the individual fields in the device | |
81 | */ | |
82 | VINUM_TYPE_SHIFT = 28, | |
83 | VINUM_VOL_SHIFT = 0, | |
84 | VINUM_PLEX_SHIFT = 16, | |
85 | VINUM_SD_SHIFT = 20, | |
86 | VINUM_VOL_WIDTH = 8, | |
87 | VINUM_PLEX_WIDTH = 3, | |
88 | VINUM_SD_WIDTH = 8, | |
89 | ||
90 | /* | |
91 | * Shifts for the second half of raw plex and | |
92 | * subdisk numbers | |
93 | */ | |
94 | VINUM_RAWPLEX_SHIFT = 8, /* shift the second half this much */ | |
95 | VINUM_RAWPLEX_WIDTH = 12, /* width of second half */ | |
96 | ||
97 | MAJORDEV_SHIFT = 8, | |
98 | ||
99 | MAXPLEX = 8, /* max number of plexes in a volume */ | |
100 | MAXSD = 256, /* max number of subdisks in a plex */ | |
101 | MAXDRIVENAME = 32, /* max length of a device name */ | |
102 | MAXSDNAME = 64, /* max length of a subdisk name */ | |
103 | MAXPLEXNAME = 64, /* max length of a plex name */ | |
104 | MAXVOLNAME = 64, /* max length of a volume name */ | |
105 | MAXNAME = 64, /* max length of any name */ | |
106 | ||
107 | ||
108 | /* | |
109 | * Define a minor device number. | |
110 | * This is not used directly; instead, it's | |
111 | * called by the other macros. | |
112 | */ | |
113 | #define VINUMMINOR(v,p,s,t) ( (v << VINUM_VOL_SHIFT) \ | |
984263bc MD |
114 | | (p << VINUM_PLEX_SHIFT) \ |
115 | | (s << VINUM_SD_SHIFT) \ | |
116 | | (t << VINUM_TYPE_SHIFT) ) | |
117 | ||
e4c9c0c8 | 118 | /* Create device minor numbers */ |
984263bc | 119 | |
e4c9c0c8 | 120 | #ifdef _KERNEL |
984263bc | 121 | |
e4c9c0c8 | 122 | #define VINUMDEV(v,p,s,t) \ |
d736a600 | 123 | VINUMMINOR (v, p, s, t) |
e4c9c0c8 MD |
124 | |
125 | #define VINUM_PLEX(p) \ | |
d736a600 | 126 | ((VINUM_RAWPLEX_TYPE << VINUM_TYPE_SHIFT) \ |
e4c9c0c8 | 127 | | (p & 0xff) \ |
d736a600 | 128 | | ((p & ~0xff) << 8)) |
e4c9c0c8 MD |
129 | |
130 | #define VINUM_SD(s) \ | |
d736a600 | 131 | ((VINUM_RAWSD_TYPE << VINUM_TYPE_SHIFT) \ |
e4c9c0c8 | 132 | | (s & 0xff) \ |
d736a600 | 133 | | ((s & ~0xff) << 8)) |
e4c9c0c8 MD |
134 | |
135 | #endif | |
984263bc MD |
136 | |
137 | /* Create a bit mask for x bits */ | |
138 | #define MASK(x) ((1 << (x)) - 1) | |
139 | ||
140 | /* Create a raw block device minor number */ | |
7c8f38d4 MD |
141 | #define VINUMRMINOR(d,t) \ |
142 | ( ((d & MASK(VINUM_VOL_WIDTH)) << VINUM_VOL_SHIFT) \ | |
143 | | ((d & ~MASK(VINUM_VOL_WIDTH)) << \ | |
144 | (VINUM_PLEX_SHIFT + VINUM_VOL_WIDTH)) \ | |
145 | | (t << VINUM_TYPE_SHIFT) ) | |
984263bc | 146 | |
984263bc MD |
147 | /* extract device type */ |
148 | #define DEVTYPE(x) ((minor (x) >> VINUM_TYPE_SHIFT) & 7) | |
149 | ||
150 | /* | |
151 | * This mess is used to catch people who compile | |
152 | * a debug vinum(8) and non-debug kernel module, | |
153 | * or the other way round. | |
154 | */ | |
155 | ||
156 | #ifdef VINUMDEBUG | |
7c8f38d4 MD |
157 | |
158 | /* superdevice number */ | |
159 | #define VINUM_SUPERDEV VINUMMINOR(1, 0, 0, VINUM_SUPERDEV_TYPE) | |
160 | ||
161 | /* non-debug superdevice number */ | |
162 | #define VINUM_WRONGSUPERDEV VINUMMINOR(2, 0, 0, VINUM_SUPERDEV_TYPE) | |
163 | ||
984263bc | 164 | #else |
984263bc | 165 | |
7c8f38d4 MD |
166 | /* superdevice number */ |
167 | #define VINUM_SUPERDEV VINUMMINOR(2, 0, 0, VINUM_SUPERDEV_TYPE) | |
984263bc | 168 | |
7c8f38d4 MD |
169 | /* debug superdevice number */ |
170 | #define VINUM_WRONGSUPERDEV VINUMMINOR(1, 0, 0, VINUM_SUPERDEV_TYPE) | |
984263bc | 171 | |
7c8f38d4 MD |
172 | #endif |
173 | ||
174 | /* daemon superdevice number */ | |
175 | #define VINUM_DAEMON_DEV VINUMMINOR(0, 0, 0, VINUM_SUPERDEV_TYPE) | |
176 | ||
177 | /* | |
178 | * the number of object entries to cater for initially, and also the | |
179 | * value by which they are incremented. It doesn't take long | |
180 | * to extend them, so theoretically we could start with 1 of each, but | |
181 | * it's untidy to allocate such small areas. These values are | |
182 | * probably too small. | |
183 | */ | |
184 | ||
185 | INITIAL_DRIVES = 4, | |
186 | INITIAL_VOLUMES = 4, | |
187 | INITIAL_PLEXES = 8, | |
188 | INITIAL_SUBDISKS = 16, | |
189 | INITIAL_SUBDISKS_IN_PLEX = 4, /* num subdisks to alloc to a plex */ | |
190 | INITIAL_SUBDISKS_IN_DRIVE = 4, /* num subdisks to alloc to a drive */ | |
191 | INITIAL_DRIVE_FREELIST = 16, /* num entries in drive freelist */ | |
192 | PLEX_REGION_TABLE_SIZE = 8, /* num entries in plex region tables */ | |
193 | PLEX_LOCKS = 256, /* num locks to alloc to a plex */ | |
194 | MAX_REVIVE_BLOCKSIZE = MAXPHYS, /* maximum revive block size */ | |
195 | DEFAULT_REVIVE_BLOCKSIZE = 65536,/* default revive block size */ | |
196 | VINUMHOSTNAMELEN = 32, /* host name field in label */ | |
984263bc MD |
197 | }; |
198 | ||
199 | /* device numbers */ | |
200 | ||
201 | /* | |
7c8f38d4 MD |
202 | * 31 30 28 27 20 19 18 16 15 8 7 0 |
203 | * |---------------------------------------------------------------------| | |
204 | * |X | Type | Subdisk number | X| Plex | Major | volno | | |
205 | * |---------------------------------------------------------------------| | |
984263bc | 206 | * |
7c8f38d4 | 207 | * 0x2 03 1 19 06 |
984263bc MD |
208 | * |
209 | * The fields in the minor number are interpreted as follows: | |
210 | * | |
211 | * Volume: Only type and volume number are relevant | |
7c8f38d4 MD |
212 | * Plex in volume: type, plex number in volume and volume number |
213 | * are relevant | |
984263bc MD |
214 | * raw plex: type, plex number is made of bits 27-16 and 7-0 |
215 | * raw subdisk: type, subdisk number is made of bits 27-16 and 7-0 | |
216 | */ | |
217 | ||
7c8f38d4 MD |
218 | #if 0 |
219 | ||
984263bc MD |
220 | /* This doesn't get used. Consider removing it. */ |
221 | struct devcode { | |
7c8f38d4 MD |
222 | /* |
223 | * CARE. These fields assume a big-endian word. On a | |
224 | * little-endian system, they're the wrong way around | |
225 | */ | |
226 | unsigned volume:8; /* up to 256 volumes */ | |
227 | unsigned major:8; /* major number fits */ | |
228 | unsigned plex:3; /* up to 8 plexes per volume */ | |
229 | unsigned unused:1; /* up for grabs */ | |
230 | unsigned sd:8; /* up to 256 subdisks per plex */ | |
231 | unsigned type:3; /* type of object */ | |
232 | /* | |
233 | * type field | |
234 | VINUM_VOLUME = 0, | |
235 | VINUM_PLEX = 1, | |
236 | VINUM_SUBDISK = 2, | |
237 | VINUM_DRIVE = 3, | |
238 | VINUM_SUPERDEV = 4, | |
239 | VINUM_RAWPLEX = 5, | |
240 | VINUM_RAWSD = 6 */ | |
241 | unsigned signbit:1; /* to make 32 bits */ | |
984263bc MD |
242 | }; |
243 | ||
7c8f38d4 MD |
244 | #endif |
245 | ||
/* Name prefix of vinum devices, and the directory holding them. */
#define VINUM_BASE			"vinum/"
#define VINUM_DIR			"/dev/vinum"

/*
 * Super device names.  They help catch userland/kernel mismatches:
 * a VINUMDEBUG build calls its super device "Control", a normal
 * build calls it "control", and the _WRONG names always refer to
 * the super device of the other flavour.
 */
#ifndef VINUMDEBUG

/* normal super device */
#define VINUM_SUPERDEV_NAME		VINUM_DIR "/control"
#define VINUM_SUPERDEV_BASE		VINUM_BASE "control"

/* debug super device */
#define VINUM_WRONGSUPERDEV_NAME	VINUM_DIR "/Control"
#define VINUM_WRONGSUPERDEV_BASE	VINUM_BASE "Control"

#else

/* debug super device */
#define VINUM_SUPERDEV_NAME		VINUM_DIR "/Control"
#define VINUM_SUPERDEV_BASE		VINUM_BASE "Control"

/* normal super device */
#define VINUM_WRONGSUPERDEV_NAME	VINUM_DIR "/control"
#define VINUM_WRONGSUPERDEV_BASE	VINUM_BASE "control"

#endif

/* super device for daemon only */
#define VINUM_DAEMON_DEV_NAME		VINUM_DIR "/controld"
#define VINUM_DAEMON_DEV_BASE		VINUM_BASE "controld"
984263bc MD |
278 | |
279 | /* | |
280 | * Flags for all objects. Most of them only apply to | |
281 | * specific objects, but we have space for all in any | |
282 | * 32 bit flags word. | |
283 | */ | |
enum objflags {
    VF_LOCKED		 = 0x000001,	/* access to this object is locked */
    VF_LOCKING		 = 0x000002,	/* we want access to this object */
    VF_OPEN		 = 0x000004,	/* object has openers */
    VF_WRITETHROUGH	 = 0x000008,	/* volume: write through */
    VF_INITED		 = 0x000010,	/* unit has been initialized */
    /* 0x000020 is unused; it was VF_WLABEL (label area is writable) */
    VF_LABELLING	 = 0x000040,	/* unit is currently being labelled */
    VF_WANTED		 = 0x000080,	/* waiting to obtain a lock */
    VF_RAW		 = 0x000100,	/* raw volume (no file system) */
    VF_LOADED		 = 0x000200,	/* module is loaded */
    VF_CONFIGURING	 = 0x000400,	/* someone is changing the config */
    VF_WILL_CONFIGURE	 = 0x000800,	/* someone wants to change the config */
    VF_CONFIG_INCOMPLETE = 0x001000,	/* config change not finished */
    VF_CONFIG_SETUPSTATE = 0x002000,	/* set a vol up if all plexes empty */
    VF_READING_CONFIG	 = 0x004000,	/* reading config database from disk */
    VF_FORCECONFIG	 = 0x008000,	/* config drives even with diff names */
    VF_NEWBORN		 = 0x010000,	/* objects: we have just created it */
    VF_CONFIGURED	 = 0x020000,	/* drives: we read the config */
    VF_STOPPING		 = 0x040000,	/* vinum_conf: stop on last close */
    VF_DAEMONOPEN	 = 0x080000,	/* superdev only: daemon has us open */
    VF_CREATED		 = 0x100000,	/* vols: freshly created, more than new */
    VF_HOTSPARE		 = 0x200000,	/* drives: use as hot spare */
    VF_RETRYERRORS	 = 0x400000,	/* don't down subdisks on I/O errors */
};
314 | ||
315 | /* Global configuration information for the vinum subsystem */ | |
#define VINUM_MAXACTIVE	30000		/* max number of active requests */

struct _vinum_conf {
    /* Tables of vinum objects, one per object kind */
    struct drive *drive;
    struct sd *sd;
    struct plex *plex;
    struct volume *volume;

    /* Number of entries allocated in each table */
    int drives_allocated;
    int subdisks_allocated;
    int plexes_allocated;
    int volumes_allocated;

    /* Number of entries currently in use in each table */
    int drives_used;
    int subdisks_used;
    int plexes_used;
    int volumes_used;

    int flags;

    /* Request accounting */
    int active;				/* requests outstanding right now */
    int maxactive;			/* most requests ever outstanding */
#ifdef VINUMDEBUG
    struct request *lastrq;
    struct bio *lastbio;
#endif
    int physbufs;
};
346 | ||
347 | /* Use these defines to simplify code */ | |
/* Shorthands for the corresponding members of vinum_conf. */
#define DRIVE	vinum_conf.drive	/* drive table */
#define SD	vinum_conf.sd		/* subdisk table */
#define PLEX	vinum_conf.plex		/* plex table */
#define VOL	vinum_conf.volume	/* volume table */
#define VFLAGS	vinum_conf.flags	/* flags word */
353 | ||
354 | /* | |
355 | * Slice header | |
356 | * | |
357 | * Vinum drives start with this structure: | |
358 | * | |
359 | *\ Sector | |
360 | * |--------------------------------------| | |
361 | * | PDP-11 memorial boot block | 0 | |
362 | * |--------------------------------------| | |
363 | * | Disk label, maybe | 1 | |
364 | * |--------------------------------------| | |
365 | * | Slice definition (vinum_hdr) | 8 | |
366 | * |--------------------------------------| | |
367 | * | | | |
368 | * | Configuration info, first copy | 9 | |
369 | * | | | |
370 | * |--------------------------------------| | |
371 | * | | | |
372 | * | Configuration info, second copy | 9 + size of config | |
373 | * | | | |
374 | * |--------------------------------------| | |
375 | */ | |
376 | ||
7c8f38d4 MD |
377 | /* |
378 | * Sizes and offsets of our information | |
379 | */ | |
enum {
    VINUM_LABEL_OFFSET = 4096,		/* where the vinum label starts */
    VINUMHEADERLEN = 512,		/* size of the vinum label */

    /* the first config copy follows the label directly (4608) */
    VINUM_CONFIG_OFFSET = VINUM_LABEL_OFFSET + VINUMHEADERLEN,
    MAXCONFIG = 65536,			/* size of one config copy */

    /* the data starts behind both config copies, in DEV_BSIZE units */
    DATASTART = (VINUM_CONFIG_OFFSET + 2 * MAXCONFIG) / DEV_BSIZE
};
389 | ||
390 | /* | |
391 | * hostname is 256 bytes long, but we don't need to shlep | |
392 | * multiple copies in vinum. We use the host name just | |
393 | * to identify this system, and 32 bytes should be ample | |
394 | * for that purpose | |
395 | */ | |
396 | ||
397 | struct vinum_label { | |
7c8f38d4 MD |
398 | char sysname[VINUMHOSTNAMELEN]; /* system name at time of creation */ |
399 | char name[MAXDRIVENAME]; /* our name of the drive */ | |
400 | struct timeval date_of_birth; /* the time it was created */ | |
401 | struct timeval last_update; /* and the time of last update */ | |
402 | /* | |
403 | * total size in bytes of the drive. This value | |
404 | * includes the headers. | |
405 | */ | |
406 | off_t drive_size; | |
984263bc MD |
407 | }; |
408 | ||
409 | struct vinum_hdr { | |
7c8f38d4 MD |
410 | uint64_t magic; /* we're long on magic numbers */ |
411 | ||
412 | /* | |
413 | * Size in bytes of each copy of the | |
414 | * configuration info. This must be a multiple | |
415 | * of the sector size. | |
416 | */ | |
417 | int config_length; | |
418 | struct vinum_label label; /* unique label */ | |
984263bc MD |
419 | }; |
420 | ||
7c8f38d4 MD |
/*
 * Magic numbers, written in hex.  Stored least significant byte
 * first, the eight bytes read "IN VINO\0" and "NO VINO\0".
 */

/* the magic number a valid header should carry (22322600044678729) */
#define VINUM_MAGIC	0x004F4E4956204E49LL

/* what the magic number becomes after obliteration (22322600044678990) */
#define VINUM_NOMAGIC	0x004F4E4956204F4ELL
426 | ||
984263bc MD |
427 | /* Information returned from read_drive_label */ |
enum drive_label_info {
    DL_CANT_OPEN = 0,			/* invalid partition */
    DL_NOT_OURS = 1,			/* valid partition without a vinum label */
    DL_DELETED_LABEL = 2,		/* valid partition, deleted label found */
    DL_WRONG_DRIVE = 3,			/* drive name doesn't match */
    DL_OURS = 4				/* valid partition and label found */
};
435 | ||
436 | /*** Drive definitions ***/ | |
437 | /* | |
438 | * A drive corresponds to a disk slice. We use a different term to show | |
439 | * the difference in usage: it doesn't have to be a slice, and could | |
440 | * theoretically be a complete, unpartitioned disk | |
441 | */ | |
442 | ||
443 | struct drive { | |
7c8f38d4 MD |
444 | char devicename[MAXDRIVENAME]; /* name of the slice it's on */ |
445 | enum drivestate state; /* current state */ | |
446 | int flags; /* flags */ | |
447 | int subdisks_allocated; /* number of entries in sd */ | |
448 | int subdisks_used; /* and the number used */ | |
449 | int blocksize; /* size of fs blocks */ | |
450 | int pid; /* of locker */ | |
451 | u_int64_t sectors_available; /* number of sectors still available */ | |
452 | int secsperblock; | |
453 | int lasterror; /* last error on drive */ | |
454 | int driveno; /* index of drive in vinum_conf */ | |
455 | int opencount; /* number of up subdisks */ | |
456 | u_int64_t reads; /* number of reads on this drive */ | |
457 | u_int64_t writes; /* number of writes on this drive */ | |
458 | u_int64_t bytes_read; /* number of bytes read */ | |
459 | u_int64_t bytes_written; /* number of bytes written */ | |
460 | struct vinum_label label; /* and the label information */ | |
461 | #define DRIVE_MAXACTIVE 30000 /* maximum number of active requests */ | |
462 | int active; /* current number of reqs outstanding */ | |
463 | int maxactive; /* max num of reqs ever outstanding */ | |
464 | int freelist_size; /* entries alloced in free list */ | |
465 | int freelist_entries; /* entries used in free list */ | |
466 | struct drive_freelist { /* sorted list of free space on drive */ | |
467 | u_int64_t offset; /* offset of entry */ | |
468 | u_int64_t sectors; /* and length in sectors */ | |
469 | } *freelist; | |
470 | struct partinfo partinfo; /* partition information */ | |
471 | /* XXX kludge until we get this struct cleaned up */ | |
dcd1a9c7 | 472 | #ifdef _KERNEL |
7c8f38d4 MD |
473 | struct vnode *vp; |
474 | struct cdev *dev; | |
984263bc | 475 | #else |
d736a600 MD |
476 | void *vp_dummy; |
477 | void *dev_dummy; | |
984263bc MD |
478 | #endif |
479 | #ifdef VINUMDEBUG | |
7c8f38d4 MD |
480 | char lockfilename[16]; /* locked with file */ |
481 | int lockline; /* and the line number */ | |
984263bc MD |
482 | #endif |
483 | }; | |
484 | ||
485 | /*** Subdisk definitions ***/ | |
486 | ||
487 | struct sd { | |
7c8f38d4 MD |
488 | char name[MAXSDNAME]; /* name of subdisk */ |
489 | enum sdstate state; /* state */ | |
490 | int flags; | |
491 | int lasterror; /* last error occurred */ | |
492 | /* offsets in blocks */ | |
493 | int64_t driveoffset; /* offset on drive */ | |
d736a600 MD |
494 | #ifdef _KERNEL |
495 | cdev_t sd_dev; | |
496 | #else | |
497 | void *sd_dev_dummy; | |
498 | #endif | |
7c8f38d4 MD |
499 | |
500 | /* | |
501 | * plexoffset is the offset from the beginning | |
502 | * of the plex to the very first part of the | |
503 | * subdisk, in sectors. For striped, RAID-4 and | |
504 | * RAID-5 plexes, only the first stripe is | |
505 | * located at this offset | |
506 | */ | |
507 | int64_t plexoffset; /* offset in plex */ | |
508 | u_int64_t sectors; /* and length in sectors */ | |
509 | int plexno; /* index of plex, if it belongs */ | |
510 | int driveno; /* index of the drive */ | |
511 | int sdno; /* our index in vinum_conf */ | |
512 | int plexsdno; /* and our number in our plex */ | |
513 | /* (undefined if no plex) */ | |
514 | u_int64_t reads; /* number of reads on this subdisk */ | |
515 | u_int64_t writes; /* number of writes on this subdisk */ | |
516 | u_int64_t bytes_read; /* number of bytes read */ | |
517 | u_int64_t bytes_written; /* number of bytes written */ | |
518 | /* revive parameters */ | |
519 | u_int64_t revived; /* blkno of current revive request */ | |
520 | int revive_blocksize; /* revive block size (bytes) */ | |
521 | int revive_interval; /* and time to wait between transfers */ | |
522 | pid_t reviver; /* PID of reviving process */ | |
523 | /* init parameters */ | |
524 | u_int64_t initialized; /* blkno of current init request */ | |
525 | int init_blocksize; /* init block size (bytes) */ | |
526 | int init_interval; /* time to wait between transfers */ | |
527 | struct request *waitlist; /* list of reqs waiting on revive op */ | |
984263bc MD |
528 | }; |
529 | ||
530 | /*** Plex definitions ***/ | |
531 | ||
532 | /* kinds of plex organization */ | |
enum plexorg {
    plex_disorg,			/* disorganized */
    plex_concat,			/* concatenated plex */
    plex_striped,			/* striped plex */
    plex_raid4,				/* RAID4 plex */
    plex_raid5				/* RAID5 plex */
};

/*
 * Recognize plex organizations.  Both macros take a pointer to an
 * object with an `organization' member and rely on the order of the
 * enum plexorg values above.  The argument is parenthesized so that
 * expressions such as &plex or base + n can be passed.
 */

/* striped, RAID 4 or RAID 5 */
#define isstriped(p) ((p)->organization >= plex_striped)

/* RAID 4 or RAID 5 */
#define isparity(p) ((p)->organization >= plex_raid4)
984263bc MD |
547 | |
548 | struct plex { | |
7c8f38d4 MD |
549 | char name[MAXPLEXNAME]; /* name of plex */ |
550 | enum plexorg organization; /* Plex organization */ | |
551 | enum plexstate state; /* and current state */ | |
d736a600 MD |
552 | #ifdef _KERNEL |
553 | cdev_t plex_dev; | |
554 | #else | |
555 | void *plex_dev_dummy; | |
556 | #endif | |
7c8f38d4 MD |
557 | u_int64_t length; /* total length of plex (sectors) */ |
558 | int flags; | |
559 | int stripesize; /* size of stripe or raid band, | |
560 | * in sectors */ | |
561 | int subdisks; /* number of associated subdisks */ | |
562 | int subdisks_allocated; /* number of subdisks allocated | |
563 | * space for */ | |
564 | int *sdnos; /* list of component subdisks */ | |
565 | int plexno; /* index of plex in vinum_conf */ | |
566 | int volno; /* index of volume */ | |
567 | int volplexno; /* number of plex in volume */ | |
568 | /* Statistics */ | |
569 | u_int64_t reads; /* number of reads on this plex */ | |
570 | u_int64_t writes; /* number of writes on this plex */ | |
571 | u_int64_t bytes_read; /* number of bytes read */ | |
572 | u_int64_t bytes_written; /* number of bytes written */ | |
573 | u_int64_t recovered_reads; /* number of recovered read | |
574 | * operations */ | |
575 | u_int64_t degraded_writes; /* number of degraded writes */ | |
576 | u_int64_t parityless_writes; /* number of parityless writes */ | |
577 | u_int64_t multiblock; /* requests that needed more than | |
578 | * one block */ | |
579 | u_int64_t multistripe; /* requests that needed more than | |
580 | * one stripe */ | |
581 | int sddowncount; /* number of subdisks down */ | |
582 | ||
583 | /* Lock information */ | |
584 | int usedlocks; /* number currently in use */ | |
585 | int lockwaits; /* and number of waits for locks */ | |
586 | off_t checkblock; /* block number for parity op */ | |
587 | struct rangelock *lock; /* ranges of locked addresses */ | |
984263bc MD |
588 | }; |
589 | ||
590 | /*** Volume definitions ***/ | |
591 | ||
592 | /* Address range definitions, for locking volumes */ | |
593 | struct rangelock { | |
c2fcd54e | 594 | vinum_off_t stripe; /* address + 1 of the range being locked */ |
7c8f38d4 | 595 | struct buf *bp; /* user's buffer pointer */ |
984263bc MD |
596 | }; |
597 | ||
598 | struct volume { | |
7c8f38d4 MD |
599 | char name[MAXVOLNAME]; /* name of volume */ |
600 | enum volumestate state; /* current state */ | |
601 | int plexes; /* number of plexes */ | |
602 | int preferred_plex; /* plex to read from, -1 for | |
603 | * round-robin */ | |
d736a600 MD |
604 | #ifdef _KERNEL |
605 | cdev_t vol_dev; | |
606 | #else | |
607 | void *vol_dev_dummy; | |
608 | #endif | |
609 | ||
7c8f38d4 MD |
610 | /* |
611 | * index of plex used for last read, for | |
612 | * round-robin. | |
613 | */ | |
614 | int last_plex_read; | |
615 | int volno; /* volume number */ | |
616 | int flags; /* status and configuration flags */ | |
617 | int openflags; /* flags supplied to last open(2) */ | |
618 | u_int64_t size; /* size of volume */ | |
619 | int blocksize; /* logical block size */ | |
620 | int active; /* number of outstanding | |
621 | * requests active */ | |
622 | int subops; /* and the number of suboperations */ | |
623 | /* Statistics */ | |
624 | u_int64_t bytes_read; /* number of bytes read */ | |
625 | u_int64_t bytes_written; /* number of bytes written */ | |
626 | u_int64_t reads; /* number of reads on this volume */ | |
627 | u_int64_t writes; /* number of writes on this volume */ | |
628 | u_int64_t recovered_reads; /* reads recovered from another plex */ | |
629 | ||
630 | /* | |
631 | * Unlike subdisks in the plex, space for the | |
632 | * plex pointers is static. | |
633 | */ | |
634 | int plex[MAXPLEX]; /* index of plexes */ | |
984263bc MD |
635 | }; |
636 | ||
637 | /* | |
638 | * Table expansion. Expand table, which contains oldcount | |
639 | * entries of type element, by increment entries, and change | |
640 | * oldcount accordingly | |
641 | */ | |
/*
 * EXPAND(table, element, oldcount, increment)
 *
 * Calls expand_table() on the address of `table' with the old size
 * (oldcount elements of type `element') and the new size (oldcount +
 * increment elements), then adds `increment' to `oldcount'.
 * `table' and `oldcount' must be lvalues; `oldcount' and `increment'
 * are evaluated more than once.
 *
 * Every argument is parenthesized so that an argument containing a
 * low-precedence operator (e.g. a ?: expression as `increment') is
 * treated as a unit.  The expansion is kept as a plain brace block,
 * so existing uses with or without a trailing semicolon still compile.
 */
#define EXPAND(table, element, oldcount, increment)			\
{									\
    expand_table((void **) &(table),					\
		 (oldcount) * sizeof (element),				\
		 ((oldcount) + (increment)) * sizeof (element));	\
    (oldcount) += (increment);						\
}
984263bc | 649 | |
7c8f38d4 MD |
650 | /* |
651 | * Information on vinum's memory usage | |
652 | */ | |
struct meminfo {
    /* number of malloced blocks */
    int mallocs;
    /* total amount malloced */
    int total_malloced;
    /* maximum number of mallocs */
    int highwater;
    /* pointer to kernel table */
    struct mc *malloced;
};
659 | ||
660 | #define MCFILENAMELEN 16 | |
661 | struct mc { | |
7c8f38d4 MD |
662 | struct timeval time; |
663 | int seq; | |
664 | int size; | |
665 | short line; | |
666 | caddr_t address; | |
667 | char file[MCFILENAMELEN]; | |
984263bc MD |
668 | }; |
669 | ||
670 | /* | |
671 | * These enums are used by the state transition | |
672 | * routines. They're in bit map format: | |
673 | * | |
674 | * Bit 0: Other plexes in the volume are down | |
675 | * Bit 1: Other plexes in the volume are up | |
676 | * Bit 2: The current plex is up | |
677 | * Maybe they should be local to | |
678 | * state.c | |
679 | */ | |
enum volplexstate {
    volplex_onlyusdown	= 0,		/* we're the only plex, and we're down */
    volplex_alldown	= 1,		/* another plex is down, and so are we */
    volplex_otherup	= 2,		/* another plex is up */
    volplex_otherupdown	= 3,		/* other plexes are up and down */
    volplex_onlyus	= 4,		/* we're up and alone */
    volplex_onlyusup	= 5,		/* only we are up, others are down */
    volplex_allup	= 6,		/* all plexes are up */
    volplex_someup	= 7		/* some plexes are up, including us */
};
693 | ||
694 | /* state map for plex */ | |
enum sdstates {
    sd_emptystate	= 1 << 0,
    sd_downstate	= 1 << 1,	/* SD is down */
    sd_crashedstate	= 1 << 2,	/* SD is crashed */
    sd_obsoletestate	= 1 << 3,	/* SD is obsolete */
    sd_stalestate	= 1 << 4,	/* SD is stale */
    sd_rebornstate	= 1 << 5,	/* SD is reborn */
    sd_upstate		= 1 << 6,	/* SD is up */
    sd_initstate	= 1 << 7,	/* SD is initializing */
    sd_initializedstate	= 1 << 8,	/* SD is initialized */
    sd_otherstate	= 1 << 9,	/* SD is in some other state */
};
707 | ||
708 | /* | |
709 | * This is really just a parameter to pass to | |
710 | * set_<foo>_state, but since it needs to be known | |
711 | * in the external definitions, we need to define | |
712 | * it here | |
713 | */ | |
enum setstateflags {
    /* no flags */
    setstate_none = 0,
    /* force the state change */
    setstate_force = 1,
    /* we're currently configuring, so don't save */
    setstate_configuring = 2,
};
720 | ||
721 | /* Operations for parityops to perform. */ | |
enum parityop {
    checkparity = 0,
    rebuildparity = 1,
    /* rebuildparity with the -v option */
    rebuildandcheckparity = 2,
};
727 | ||
728 | #ifdef VINUMDEBUG | |
7c8f38d4 MD |
729 | |
730 | /* | |
731 | * Debugging stuff | |
732 | */ | |
enum debugflags {
    DEBUG_ADDRESSES	 = 1 << 0,	/* show buffer information during
					 * requests */
    DEBUG_NUMOUTPUT	 = 1 << 1,	/* show the value of vp->v_numoutput */
    DEBUG_RESID		 = 1 << 2,	/* go into debugger in complete_rqe */
    DEBUG_LASTREQS	 = 1 << 3,	/* keep a circular buffer of the
					 * last requests */
    DEBUG_REVIVECONFLICT = 1 << 4,	/* print info about revive conflicts */
    DEBUG_EOFINFO	 = 1 << 5,	/* print info about EOF detection */
    DEBUG_MEMFREE	 = 1 << 6,	/* keep info about Frees */
    DEBUG_BIGDRIVE	 = 1 << 7,	/* pretend our drives are 100 times
					 * the size */
    DEBUG_REMOTEGDB	 = 1 << 8,	/* go into remote gdb */
    DEBUG_WARNINGS	 = 1 << 9,	/* log various relatively harmless
					 * warnings */
};
749 | ||
984263bc | 750 | #endif |