2 * Copyright (c) 1997, 1998, 1999
3 * Nan Yang Computer Services Limited. All rights reserved.
5 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
7 * Written by Greg Lehey
9 * This software is distributed under the so-called ``Berkeley
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by Nan Yang Computer
24 * 4. Neither the name of the Company nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
28 * This software is provided ``as is'', and any express or implied
29 * warranties, including, but not limited to, the implied warranties of
30 * merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall the company or contributors be liable for any
32 * direct, indirect, incidental, special, exemplary, or consequential
33 * damages (including, but not limited to, procurement of substitute
34 * goods or services; loss of use, data, or profits; or business
35 * interruption) however caused and on any theory of liability, whether
36 * in contract, strict liability, or tort (including negligence or
37 * otherwise) arising in any way out of the use of this software, even if
38 * advised of the possibility of such damage.
40 * $Id: vinumrevive.c,v 1.14 2000/12/21 01:55:11 grog Exp grog $
41 * $FreeBSD: src/sys/dev/vinum/vinumrevive.c,v 1.22.2.5 2001/03/13 02:59:43 grog Exp $
42 * $DragonFly: src/sys/dev/raid/vinum/vinumrevive.c,v 1.8 2006/03/26 07:56:54 swildner Exp $
49 * Revive a block of a subdisk. Return an error
50 * indication. EAGAIN means successful copy, but
51 * that more blocks remain to be copied. EINVAL
52 * means that the subdisk isn't associated with a
53 * plex (which means a programming error if we get
54 * here at all; FIXME).
/*
 * revive_block: copy one block onto subdisk sdno as a step of
 * reviving it.
 *
 * If the subdisk holds the parity for the current stripe, the
 * block is produced by parityrebuild(); otherwise it is read
 * through the volume (or the unattached plex) with vinumstart().
 * The buffer is then written to the subdisk at sector offset
 * sd->revived, and sd->revived is advanced by the amount
 * transferred. Once sd->revived reaches sd->sectors, the
 * subdisk is set to sd_up and the configuration is saved.
 *
 * ENOMEM is returned when parityrebuild() yields no buffer.
 *
 * NOTE(review): this listing is missing lines (the declarations
 * of sd, plex, vol, bp, error and dev, the case labels, several
 * braces and the final return are not visible), so the comments
 * below describe only what is shown.
 */
58 revive_block(int sdno)
66 int size; /* size of revive block, bytes */
67 daddr_t plexblkno; /* lblkno in plex */
68 int psd; /* parity subdisk number */
69 u_int64_t stripe; /* stripe number */
70 int paritysd = 0; /* set if this is the parity stripe */
71 struct rangelock *lock; /* for locking */
72 daddr_t stripeoffset; /* offset in stripe */
74 plexblkno = 0; /* to keep the compiler happy */
77 if (sd->plexno < 0) /* no plex? */
79 plex = &PLEX[sd->plexno]; /* point to plex */
81 vol = &VOL[plex->volno];
/*
 * Sanitize the revive block size: a zero size, or one whose
 * low DEV_BSHIFT bits are set, falls back to the default; an
 * oversized one is clamped to the maximum.
 */
85 if ((sd->revive_blocksize == 0) /* no block size */
86 ||(sd->revive_blocksize & ((1 << DEV_BSHIFT) - 1))) /* or invalid block size */
87 sd->revive_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
88 else if (sd->revive_blocksize > MAX_REVIVE_BLOCKSIZE)
89 sd->revive_blocksize = MAX_REVIVE_BLOCKSIZE;
/*
 * Transfer size in bytes: the smaller of one revive block and
 * what is left of the subdisk. The comparison is done in
 * units of (1 << DEV_BSHIFT) bytes, then shifted back.
 */
90 size = min(sd->revive_blocksize >> DEV_BSHIFT, sd->sectors - sd->revived) << DEV_BSHIFT;
91 sd->reviver = curproc->p_pid; /* note who last had a bash at it */
93 /* Now decide where to read from */
94 switch (plex->organization) {
96 plexblkno = sd->revived + sd->plexoffset; /* corresponding address in plex */
/*
 * The case label for this arm is not visible in the listing
 * (presumably the striped organization). The transfer is
 * clipped so that it ends at the stripe boundary.
 */
100 stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */
101 if (stripeoffset + (size >> DEV_BSHIFT) > plex->stripesize)
102 size = (plex->stripesize - stripeoffset) << DEV_BSHIFT;
103 plexblkno = sd->plexoffset /* base */
104 + (sd->revived - stripeoffset) * plex->subdisks /* offset to beginning of stripe */
105 + stripeoffset; /* offset from beginning of stripe */
110 stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */
111 plexblkno = sd->plexoffset /* base */
112 + (sd->revived - stripeoffset) * (plex->subdisks - 1) /* offset to beginning of stripe */
113 +stripeoffset; /* offset from beginning of stripe */
114 stripe = (sd->revived / plex->stripesize); /* stripe number */
116 /* Make sure we don't go beyond the end of the band. */
117 size = min(size, (plex->stripesize - stripeoffset) << DEV_BSHIFT);
/*
 * RAID-4 keeps parity on the last subdisk for every stripe;
 * otherwise the parity subdisk index depends on the stripe
 * number.
 */
118 if (plex->organization == plex_raid4)
119 psd = plex->subdisks - 1; /* parity subdisk for this stripe */
121 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
122 paritysd = plex->sdnos[psd] == sdno; /* note if it's the parity subdisk */
125 * Now adjust for the strangenesses
126 * in RAID-4 and RAID-5 striping.
128 if (sd->plexsdno > psd) /* beyond the parity stripe, */
129 plexblkno -= plex->stripesize; /* one stripe less */
131 plexblkno -= plex->stripesize * sd->plexsdno; /* go back to the beginning of the band */
134 case plex_disorg: /* to keep the compiler happy */
138 if (paritysd) { /* we're reviving a parity block, */
139 bp = parityrebuild(plex, sd->revived, size, rebuildparity, &lock, NULL); /* do the grunt work */
140 if (bp == NULL) /* no buffer space */
141 return ENOMEM; /* chicken out */
142 } else { /* data block */
144 bp = geteblk(size); /* Get a buffer */
150 * Amount to transfer: block size, unless it
151 * would overlap the end.
154 bp->b_resid = bp->b_bcount;
155 bp->b_bio1.bio_offset = (off_t)plexblkno << DEV_BSHIFT; /* start here */
/*
 * NOTE(review): unlike the bio_offset assignment just above,
 * the shift passed to lockrange() is done without the (off_t)
 * cast. Confirm daddr_t is wide enough that this cannot
 * overflow on large plexes.
 */
156 if (isstriped(plex)) /* we need to lock striped plexes */
157 lock = lockrange(plexblkno << DEV_BSHIFT, bp, plex); /* lock it */
158 if (vol != NULL) /* it's part of a volume, */
160 * First, read the data from the volume. We
161 * don't care which plex, that's bre's job.
163 dev = VINUMDEV(plex->volno, 0, 0, VINUM_VOLUME_TYPE); /* create the device number */
164 else /* it's an unattached plex */
165 dev = VINUM_PLEX(sd->plexno); /* create the device number */
167 bp->b_flags = B_READ; /* either way, read it */
168 vinumstart(dev, &bp->b_bio1, 1);
172 if (bp->b_flags & B_ERROR)
175 /* Now write to the subdisk */
177 dev = VINUM_SD(sdno); /* create the device number */
178 bp->b_flags = B_ORDERED | B_WRITE; /* and make this an ordered write */
179 bp->b_resid = bp->b_bcount;
180 bp->b_bio1.bio_offset = (off_t)sd->revived << DEV_BSHIFT; /* write it to here */
181 bp->b_bio1.bio_driver_info = dev;
182 sdio(&bp->b_bio1); /* perform the I/O */
184 if (bp->b_flags & B_ERROR)
187 sd->revived += bp->b_bcount >> DEV_BSHIFT; /* moved this much further down */
188 if (sd->revived >= sd->sectors) { /* finished */
190 set_sd_state(sdno, sd_up, setstate_force); /* bring the sd up */
191 log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
192 save_config(); /* and save the updated configuration */
193 error = 0; /* we're done */
196 if (lock) /* we took a lock, */
197 unlockrange(sd->plexno, lock); /* give it back */
/*
 * Relaunch the requests queued on sd->waitlist, one list
 * entry per iteration.
 * NOTE(review): sd->waitlist->next is read after
 * launch_requests() has been handed that same request.
 * Confirm the request cannot be freed by then.
 */
198 while (sd->waitlist) { /* we have waiting requests */
200 struct request *rq = sd->waitlist;
203 if (debug & DEBUG_REVIVECONFLICT) {
204 dev = rq->bio->bio_driver_info;
206 "Relaunch revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%llx, length %d\n",
209 rq->bio->bio_buf->b_flags & B_READ ? "Read" : "Write",
213 rq->bio->bio_buf->b_bcount);
216 launch_requests(sd->waitlist, 1); /* do them now */
217 sd->waitlist = sd->waitlist->next; /* and move on to the next */
/*
 * Mark the buffer invalid, clear its error flag and release
 * it, but only if it is not on a buffer queue.
 */
220 if (bp->b_qindex == 0) { /* not on a queue, */
221 bp->b_flags |= B_INVAL;
222 bp->b_flags &= ~B_ERROR;
223 brelse(bp); /* is this kosher? */
229 * Check or rebuild the parity blocks of a RAID-4
232 * The variables plex->checkblock and
233 * plex->rebuildblock represent the
234 * subdisk-relative address of the stripe we're
235 * looking at, not the plex-relative address. We
236 * store it in the plex and not as a local
237 * variable because this function could be
238 * stopped, and we don't want to repeat the part
239 * we've already done. This is also the reason
240 * why we don't initialize it here except at the
241 * end. It gets initialized with the plex on
244 * Each call to this function processes at most
245 * one stripe. We can't loop in this function,
246 * because we're unstoppable, so we have to be
247 * called repeatedly from userland.
/*
 * parityops: perform one step of a parity check and/or rebuild
 * on the plex given by data->index, starting at data->offset.
 *
 * The message buffer doubles as the reply: data is cast to
 * struct _ioctl_reply * and the result is stored in
 * reply->error and reply->msg. reply->error starts out as
 * EAGAIN and becomes:
 *   EINVAL    if the plex is not a parity plex,
 *   ENOMEM    if parityrebuild() returns no buffer,
 *   b_error   of the parity buffer if it has B_ERROR set.
 * Progress is stored in plex->checkblock, which is reset to 0
 * once it reaches the sector count of the plex's first subdisk.
 *
 * NOTE(review): this listing is missing lines (the declarations
 * of plex, plexno and pbp, the assignment of op, and parts of
 * the error reporting are not visible).
 */
250 parityops(struct vinum_ioctl_msg *data)
254 int size; /* I/O transfer size, bytes */
255 int stripe; /* stripe number in plex */
256 int psd; /* parity subdisk number */
257 struct rangelock *lock; /* lock on stripe */
258 struct _ioctl_reply *reply;
259 off_t pstripe; /* pointer to our stripe counter */
261 off_t errorloc; /* offset of parity error */
262 enum parityop op; /* operation to perform */
264 plexno = data->index;
267 reply = (struct _ioctl_reply *) data;
268 reply->error = EAGAIN; /* expect to repeat this call */
269 plex = &PLEX[plexno];
270 if (!isparity(plex)) { /* not RAID-4 or RAID-5 */
271 reply->error = EINVAL;
273 } else if (plex->state < plex_flaky) {
275 strcpy(reply->msg, "Plex is not completely accessible\n");
278 pstripe = data->offset;
279 stripe = pstripe / plex->stripesize; /* stripe number */
/*
 * NOTE(review): psd is always computed with the rotating
 * formula here, whereas revive_block() uses
 * plex->subdisks - 1 for plex_raid4. Confirm whether RAID-4
 * plexes need the same special case.
 *
 * The transfer is at most one default revive block and never
 * more than one stripe.
 */
280 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
281 size = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */
282 plex->stripesize << DEV_BSHIFT);
284 pbp = parityrebuild(plex, pstripe, size, op, &lock, &errorloc); /* do the grunt work */
285 if (pbp == NULL) { /* no buffer space */
286 reply->error = ENOMEM;
287 return; /* chicken out */
290 * Now we have a result in the data buffer of
291 * the parity buffer header, which we have kept.
292 * Decide what to do with it.
294 reply->msg[0] = '\0'; /* until shown otherwise */
295 if ((pbp->b_flags & B_ERROR) == 0) { /* no error */
296 if ((op == rebuildparity)
297 || (op == rebuildandcheckparity)) {
/*
 * Reuse the parity buffer for the write-back: clear B_READ,
 * reset the residual count and hand it to sdio().
 */
298 pbp->b_flags &= ~B_READ;
299 pbp->b_resid = pbp->b_bcount;
300 sdio(&pbp->b_bio1); /* write the parity block */
303 if (((op == checkparity)
304 || (op == rebuildandcheckparity))
305 && (errorloc != -1)) {
306 if (op == checkparity)
309 "Parity incorrect at offset 0x%llx\n",
312 if (reply->error == EAGAIN) { /* still OK, */
/*
 * Advance the saved position by the amount just processed;
 * wrap to 0 once it reaches the size of the first subdisk.
 */
313 plex->checkblock = pstripe + (pbp->b_bcount >> DEV_BSHIFT); /* moved this much further down */
314 if (plex->checkblock >= SD[plex->sdnos[0]].sectors) { /* finished */
315 plex->checkblock = 0;
320 if (pbp->b_flags & B_ERROR)
321 reply->error = pbp->b_error;
322 pbp->b_flags |= B_INVAL;
323 pbp->b_flags &= ~B_ERROR;
325 unlockrange(plexno, lock);
329 * Rebuild a parity stripe. Return pointer to
330 * parity bp. On return,
332 * 1. The band is locked. The caller must unlock
333 * the band and release the buffer header.
335 * 2. All buffer headers except pbp have been
336 * released. The caller must release pbp.
338 * 3. For checkparity and rebuildandcheckparity,
339 * the parity is compared with the current
340 * parity block. If it's different, the
341 * offset of the error is returned to
342 * errorloc. The caller can set the value of
343 * the pointer to NULL if this is called for
346 * pstripe is the subdisk-relative base address of
347 * the data to be reconstructed, size is the size
348 * of the transfer in bytes.
/*
 * parityrebuild: compute the parity for part of one stripe.
 *
 * One buffer is obtained per subdisk plus one extra buffer for
 * the result. The subdisk buffers are read with sdio() and the
 * non-parity ones are XORed, int by int, into the result
 * buffer. For checkparity and rebuildandcheckparity the
 * existing parity block is read as well and compared with the
 * computed one; *errorloc is set to -1 when they match.
 * The range lock taken with lockrange() is stored in *lockp.
 *
 * NULL is returned when a buffer cannot be obtained. Callers in
 * this file use the result as the parity buffer header and
 * inspect its B_ERROR flag and b_error.
 *
 * NOTE(review): this listing is missing lines (only the plex
 * and lockp parameters are visible; the declarations of sdno,
 * i, sbuf and error, the assignment of pbp and the return
 * statement are not), so the remaining parameters are inferred
 * from their uses below: pstripe, size, op and errorloc.
 */
351 parityrebuild(struct plex *plex,
355 struct rangelock **lockp,
360 u_int64_t stripe; /* stripe number */
361 int *parity_buf; /* buffer address for current parity block */
362 int *newparity_buf; /* and for new parity block */
363 int mysize; /* I/O transfer size for this transfer */
364 int isize; /* mysize in ints */
366 int psd; /* parity subdisk number */
367 int newpsd; /* and "subdisk number" of new parity */
368 struct buf **bpp; /* pointers to our bps */
369 struct buf *pbp; /* buffer header for parity stripe */
371 int bufcount; /* number of buffers we need */
373 stripe = pstripe / plex->stripesize; /* stripe number */
374 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
375 parity_buf = NULL; /* to keep the compiler happy */
379 * It's possible that the default transfer size
380 * we chose is not a factor of the stripe size.
381 * We *must* limit this operation to a single
382 * stripe, at least for RAID-5 rebuild, since
383 * the parity subdisk changes between stripes,
384 * so in this case we need to perform a short
385 * transfer. Set variable mysize to reflect
388 mysize = min(size, (plex->stripesize * (stripe + 1) - pstripe) << DEV_BSHIFT);
389 isize = mysize / (sizeof(int)); /* number of ints in the buffer */
/*
 * Slot newpsd (== plex->subdisks) is the extra entry at the
 * end of bpp[] that holds the newly computed parity.
 */
390 bufcount = plex->subdisks + 1; /* sd buffers plus result buffer */
391 newpsd = plex->subdisks;
/*
 * NOTE(review): no check of the Malloc() result is visible,
 * and no Free(bpp) is visible on the early-return path below.
 * Confirm against the complete file.
 */
392 bpp = (struct buf **) Malloc(bufcount * sizeof(struct buf *)); /* array of pointers to bps */
394 /* First, build requests for all subdisks */
395 for (sdno = 0; sdno < bufcount; sdno++) { /* for each subdisk */
396 if ((sdno != psd) || (op != rebuildparity)) {
397 /* Get a buffer header and initialize it. */
399 bpp[sdno] = geteblk(mysize); /* Get a buffer */
400 if (bpp[sdno] == NULL) {
/*
 * NOTE(review): when op == rebuildparity, bpp[psd] is never
 * assigned by the enclosing loop, yet this unwind loop visits
 * every index below sdno. Verify that it cannot dereference
 * an unset pointer.
 */
401 while (sdno-- > 0) { /* release the ones we got */
402 bpp[sdno]->b_flags |= B_INVAL;
403 brelse(bpp[sdno]); /* give back our resources */
406 printf("vinum: can't allocate buffer space for parity op.\n");
407 return NULL; /* no bpps */
411 parity_buf = (int *) bpp[sdno]->b_data;
412 if (sdno == newpsd) /* the new one? */
413 bpp[sdno]->b_bio1.bio_driver_info = VINUM_SD(plex->sdnos[psd]); /* write back to the parity SD */
415 bpp[sdno]->b_bio1.bio_driver_info = VINUM_SD(plex->sdnos[sdno]); /* device number */
416 bpp[sdno]->b_flags = B_READ; /* either way, read it */
417 bpp[sdno]->b_bcount = mysize;
418 bpp[sdno]->b_resid = bpp[sdno]->b_bcount;
419 bpp[sdno]->b_bio1.bio_offset = (off_t)pstripe << DEV_BSHIFT; /* transfer from here */
423 /* Initialize result buffer */
425 newparity_buf = (int *) bpp[newpsd]->b_data;
426 bzero(newparity_buf, mysize);
429 * Now lock the stripe with the first non-parity
432 *lockp = lockrange(pstripe * plex->stripesize * (plex->subdisks - 1),
437 * Then issue requests for all subdisks in
438 * parallel. Don't transfer the parity stripe
439 * if we're rebuilding parity, unless we also
/*
 * Only the real subdisks (indices below plex->subdisks) are
 * issued here; the extra result buffer is not.
 */
442 for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each real subdisk */
443 if ((sdno != psd) || (op != rebuildparity)) {
444 sdio(&bpp[sdno]->b_bio1);
449 * Next, wait for the requests to complete.
450 * We wait in the order in which they were
451 * issued, which isn't necessarily the order in
452 * which they complete, but we don't have a
453 * convenient way of doing the latter, and the
456 for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each subdisk */
457 if ((sdno != psd) || (op != rebuildparity)) {
459 if (bpp[sdno]->b_flags & B_ERROR) /* can't read, */
460 error = bpp[sdno]->b_error;
461 else if (sdno != psd) { /* update parity */
462 sbuf = (int *) bpp[sdno]->b_data;
/* Fold this data block into the running parity, one int at a time. */
463 for (i = 0; i < isize; i++)
464 ((int *) newparity_buf)[i] ^= sbuf[i]; /* xor in the buffer */
467 if (sdno != psd) { /* release all bps except parity */
468 bpp[sdno]->b_flags |= B_INVAL;
469 brelse(bpp[sdno]); /* give back our resources */
474 * If we're checking, compare the calculated
475 * and the read parity block. If they're
476 * different, return the plex-relative offset;
477 * otherwise return -1.
479 if ((op == checkparity)
480 || (op == rebuildandcheckparity)) {
481 *errorloc = -1; /* no error yet */
/* Compare the on-disk parity with the computed parity int by int. */
482 for (i = 0; i < isize; i++) {
483 if (parity_buf[i] != newparity_buf[i]) {
484 *errorloc = (off_t) (pstripe << DEV_BSHIFT) * (plex->subdisks - 1)
489 bpp[psd]->b_flags |= B_INVAL;
490 brelse(bpp[psd]); /* give back our resources */
492 /* release our resources */
495 pbp->b_flags |= B_ERROR;
496 pbp->b_error = error;
502 * Initialize a subdisk by writing zeroes to the
503 * complete address space. If verify is set,
504 * check each transfer for correctness.
506 * Each call to this function writes (and maybe
507 * checks) a single block.
/*
 * initsd: write one block of zeroes to subdisk sdno at sector
 * offset sd->initialized and, if verify is set, read the block
 * back and check it.
 *
 * On success sd->initialized advances by the amount written.
 * When it reaches sd->sectors the subdisk is set to
 * sd_initialized and the configuration is saved; otherwise
 * error is set to EAGAIN so that the caller comes back for the
 * next block.
 *
 * NOTE(review): this listing is missing lines (the declarations
 * of sd, plex, vol, bp and error, the initialization and update
 * of verified, and the return statement are not visible).
 */
510 initsd(int sdno, int verify)
517 int size; /* size of init block, bytes */
518 daddr_t plexblkno; /* lblkno in plex */
519 int verified; /* set when we're happy with what we wrote */
522 plexblkno = 0; /* to keep the compiler happy */
524 if (sd->plexno < 0) /* no plex? */
526 plex = &PLEX[sd->plexno]; /* point to plex */
527 if (plex->volno >= 0)
528 vol = &VOL[plex->volno];
/*
 * Pick a block size if none was set: the default revive block
 * size, limited to one stripe for plexes with a stripe size.
 * A size that was set is only clamped to the maximum.
 */
532 if (sd->init_blocksize == 0) {
533 if (plex->stripesize != 0) /* we're striped, don't init more than */
534 sd->init_blocksize = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */
535 plex->stripesize << DEV_BSHIFT);
537 sd->init_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
538 } else if (sd->init_blocksize > MAX_REVIVE_BLOCKSIZE)
539 sd->init_blocksize = MAX_REVIVE_BLOCKSIZE;
/*
 * Transfer size in bytes: the smaller of one init block and
 * what is left of the subdisk, compared in units of
 * (1 << DEV_BSHIFT) bytes.
 */
541 size = min(sd->init_blocksize >> DEV_BSHIFT, sd->sectors - sd->initialized) << DEV_BSHIFT;
544 while (!verified) { /* until we're happy with it, */
546 bp = geteblk(size); /* Get a buffer */
552 bp->b_resid = bp->b_bcount;
553 bp->b_bio1.bio_offset = (off_t)sd->initialized << DEV_BSHIFT; /* write it to here */
554 bp->b_bio1.bio_driver_info = VINUM_SD(sdno);
555 bzero(bp->b_data, bp->b_bcount);
556 bp->b_flags &= ~B_READ;
557 sdio(&bp->b_bio1); /* perform the I/O */
559 if (bp->b_flags & B_ERROR)
561 if (bp->b_qindex == 0) { /* not on a queue, */
562 bp->b_flags |= B_INVAL;
563 bp->b_flags &= ~B_ERROR;
564 brelse(bp); /* is this kosher? */
566 if ((error == 0) && verify) { /* check that it got there */
568 bp = geteblk(size); /* get a buffer */
574 bp->b_resid = bp->b_bcount;
575 bp->b_bio1.bio_offset = (off_t)sd->initialized << DEV_BSHIFT; /* read from here */
576 bp->b_bio1.bio_driver_info = VINUM_SD(sdno);
577 bp->b_flags |= B_READ; /* read it back */
582 * XXX Bug fix code. This is hopefully no
583 * longer needed (21 February 2000).
/*
 * Zero check: the buffer is all zero if its first element is
 * zero and the buffer compares equal to itself shifted by one
 * element (so every element equals its successor).
 * NOTE(review): the comment below says "first word", but the
 * test dereferences b_data directly; presumably b_data is a
 * byte pointer, so only the first byte is tested - confirm.
 */
585 if (bp->b_flags & B_ERROR)
587 else if ((*bp->b_data != 0) /* first word spammed */
588 ||(bcmp(bp->b_data, &bp->b_data[1], bp->b_bcount - 1))) { /* or one of the others */
589 printf("vinum: init error on %s, offset 0x%llx sectors\n",
591 (long long) sd->initialized);
595 if (bp->b_qindex == 0) { /* not on a queue, */
596 bp->b_flags |= B_INVAL;
597 bp->b_flags &= ~B_ERROR;
598 brelse(bp); /* is this kosher? */
/*
 * Record progress; finish the subdisk when everything has
 * been written, otherwise ask to be called again.
 */
604 if (error == 0) { /* did it, */
605 sd->initialized += size >> DEV_BSHIFT; /* moved this much further down */
606 if (sd->initialized >= sd->sectors) { /* finished */
608 set_sd_state(sdno, sd_initialized, setstate_force); /* bring the sd up */
609 log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
610 save_config(); /* and save the updated configuration */
611 } else /* more to go, */
612 error = EAGAIN; /* ya'll come back, see? */
617 /* Local Variables: */
618 /* fill-column: 50 */