sys/dev/raid/vinum/vinumraid5.c

   1 /*-
   2  * Copyright (c) 1997, 1998
   3  *      Cybernet Corporation and Nan Yang Computer Services Limited.
   4  *      All rights reserved.
   5  *
   6  *  This software was developed as part of the NetMAX project.
   7  *
   8  *  Written by Greg Lehey
   9  *
  10  *  This software is distributed under the so-called ``Berkeley
  11  *  License'':
  12  *
  13  * Redistribution and use in source and binary forms, with or without
  14  * modification, are permitted provided that the following conditions
  15  * are met:
  16  * 1. Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  * 2. Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in the
  20  *    documentation and/or other materials provided with the distribution.
  21  * 3. All advertising materials mentioning features or use of this software
  22  *    must display the following acknowledgement:
  23  *      This product includes software developed by Cybernet Corporation
  24  *      and Nan Yang Computer Services Limited
  25  * 4. Neither the name of the Companies nor the names of its contributors
  26  *    may be used to endorse or promote products derived from this software
  27  *    without specific prior written permission.
  28  *
  29  * This software is provided ``as is'', and any express or implied
  30  * warranties, including, but not limited to, the implied warranties of
  31  * merchantability and fitness for a particular purpose are disclaimed.
  32  * In no event shall the company or contributors be liable for any
  33  * direct, indirect, incidental, special, exemplary, or consequential
  34  * damages (including, but not limited to, procurement of substitute
  35  * goods or services; loss of use, data, or profits; or business
  36  * interruption) however caused and on any theory of liability, whether
  37  * in contract, strict liability, or tort (including negligence or
  38  * otherwise) arising in any way out of the use of this software, even if
  39  * advised of the possibility of such damage.
  40  *
  41  * $Id: vinumraid5.c,v 1.21 2001/01/09 04:21:27 grog Exp grog $
  42  * $FreeBSD: src/sys/dev/vinum/vinumraid5.c,v 1.6.2.2 2001/03/13 02:59:43 grog Exp $
  43  */
  44 #include "vinumhdr.h"
  45 #include "request.h"
  46 #include <sys/resourcevar.h>
  47
  48 /*
  49  * Parameters which describe the current transfer.
  50  * These are only used for calculation, but they
  51  * need to be passed to other functions, so it's
  52  * tidier to put them in a struct.
      *
      * bre5() sets diskstart once per call and works out
      * the remaining fields afresh for every stripe that
      * the request touches.  A pointer to the struct is
      * passed to setrqebounds() for each request element.
  53  */
  54 struct metrics {
  55     vinum_off_t stripebase;                                         /* plex-relative address of the start of the stripe */
  56     int stripeoffset;                                       /* offset of the transfer start from stripebase */
  57     int stripesectors;                                      /* sectors still to transfer in this stripe */
  58     vinum_off_t sdbase;                                     /* offset in subdisk of stripe base */
  59     int sdcount;                                            /* number of data subdisks involved; bre5 adds 1 after building a parity element */
  60     vinum_off_t diskstart;                                          /* value of *diskaddr on entry to bre5 */
  61     int psdno;                                              /* index in the plex of the parity subdisk */
  62     int badsdno;                                            /* index in the plex of the down subdisk, -1 if none */
  63     int firstsdno;                                          /* index in the plex of the first data subdisk */
  64     /* These correspond to the fields in rqelement, sort of */
  65     int useroffset;                                         /* offset of the current block in the user buffer */
  66     /*
  67      * Initial offset and length values for the first
  68      * data block.  bre5 copies them into dataoffset and
      * datalen before each pass over the data blocks.
      *
      * NOTE(review): initlen is a short although it is
      * assigned from umin() and copied to the int datalen;
      * confirm that transfers can never exceed the range
      * of a short.
  69      */
  70     int initoffset;                                         /* offset of the transfer start within the first block */
  71     short initlen;                                          /* length in sectors of the transfer in the first block */
  72     /* Define a normal operation */
  73     int dataoffset;                                         /* offset within the current data block */
  74     int datalen;                                            /* length in sectors of the transfer in the current data block */
  75     /* Define a group operation */
  76     int groupoffset;                                        /* subdisk offset of group operation */
  77     int grouplen;                                           /* length in sectors of group operation */
  78     /* Define a normal write operation */
  79     int writeoffset;                                        /* lowest block offset touched by a normal write */
  80     int writelen;                                           /* span in sectors covered by a normal write */
  81     enum xferinfo flags;                                    /* XFR_* bits describing the kinds of transfer needed */
  82     int rqcount;                                            /* number of elements in the request group */
  83 };
  84
  85 enum requeststatus bre5(struct request *rq,                 /* build the RAID-4/5 requests for one plex */
  86     int plexno,
  87     vinum_off_t * diskstart,                                /* in/out; the definition names this diskaddr */
  88     vinum_off_t diskend);
  89 void complete_raid5_write(struct rqelement *);              /* not called by bre5 */
  90 enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex); /* bre5 treats any nonzero status as REQUEST_ENOMEM */
  91 void setrqebounds(struct rqelement *rqe, struct metrics *mp); /* set up the bounds of the transfer for rqe */
  92
  93 /*
  94  * Define the low-level requests needed to perform
  95  * a high-level I/O operation for a specific plex
  96  * 'plexno'.
  97  *
  98  * Return REQUEST_OK when requests have been built
  99  * for the whole range up to diskend, REQUEST_EOF if
 100  * the current address is at or beyond the end of
 101  * the plex, and REQUEST_ENOMEM if allocrqg() or
      * build_rq_buffer() fails.  If a stripe has more
      * than one unusable subdisk, a read returns
      * REQUEST_DOWN and a write returns the status
      * reported by checksdstate().
 102  *
 103  * Advance *diskaddr past every data block for
 104  * which a request has been built.  On read, give
 105  * up as soon as a second bad subdisk is found, so
 106  * that the caller (build_read_request) can try alternatives.
 107  *
 108  * On entry to this routine, the prq structures
 109  * are not assigned.  The assignment is performed
 110  * by expandrq().  Strictly speaking, the elements
 111  * rqe->sdno of all entries should be set to -1,
 112  * since 0 (from bzero) is a valid subdisk number.
 113  * We avoid this problem by initializing the ones
 114  * we use, and not looking at the others (index >=
 115  * prq->requests).
 116  */
 117 enum requeststatus
 118 bre5(struct request *rq,
 119     int plexno,
 120     vinum_off_t * diskaddr,
 121     vinum_off_t diskend)
 122 {
 123     struct metrics m;                                       /* most of the information */
 124     struct sd *sd;
 125     struct plex *plex;
 126     struct bio *bio;                                        /* user's bio */
 127     struct buf *bp;                                         /* buffer belonging to the user's bio */
 128     struct rqgroup *rqg;                                    /* the request group that we will create */
 129     struct rqelement *rqe;                                  /* point to this request information */
 130     int rsectors;                                           /* sectors remaining in this stripe */
 131     int mysdno;                                             /* another sd index in loops */
 132     int rqno;                                               /* request number */
 133
 134     rqg = NULL;                                             /* initialize to keep the compiler quiet */
 135     m.diskstart = *diskaddr;                                /* start of transfer */
 136     bio = rq->bio;                                          /* the user's bio */
 137     bp = bio->bio_buf;
 138     plex = &PLEX[plexno];                                   /* point to the plex */
 139
 140
 141     while (*diskaddr < diskend) {                           /* one pass per stripe, until we get it all sorted out */
 142         if (*diskaddr >= plex->length)                      /* beyond the end of the plex */
 143             return REQUEST_EOF;                             /* can't continue */
 144
 145         m.badsdno = -1;                                     /* no bad subdisk yet */
 146
 147         /* Part A: Define the request */
 148         /*
 149          * First, calculate some sizes:
 150          * The offset of the start address from
 151          * the start of the stripe.
 152          */
 153         m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1));
 154
 155         /*
 156          * The plex-relative address of the
 157          * start of the stripe.
 158          */
 159         m.stripebase = *diskaddr - m.stripeoffset;
 160
 161         /* subdisk containing the parity stripe */
 162         if (plex->organization == plex_raid5)
 163             m.psdno = plex->subdisks - 1
 164                 - (*diskaddr / (plex->stripesize * (plex->subdisks - 1)))
 165                 % plex->subdisks;
 166         else                                                /* RAID-4 */
 167             m.psdno = plex->subdisks - 1;
 168
 169         /*
 170          * The number of the subdisk in which
 171          * the start is located.
 172          */
 173         m.firstsdno = m.stripeoffset / plex->stripesize;
 174         if (m.firstsdno >= m.psdno)                         /* at or past parity sd */
 175             m.firstsdno++;                                  /* increment it */
 176
 177         /*
 178          * The offset from the beginning of
 179          * the stripe on this subdisk.
 180          */
 181         m.initoffset = m.stripeoffset % plex->stripesize;
 182
 183         /* The offset of the stripe start relative to this subdisk */
 184         m.sdbase = m.stripebase / (plex->subdisks - 1);
 185
 186         m.useroffset = *diskaddr - m.diskstart;             /* The offset of the start in the user buffer */
 187
 188         /*
 189          * The number of sectors to transfer in the
 190          * current (first) subdisk.
 191          */
 192         m.initlen = umin(diskend - *diskaddr,               /* the amount remaining to transfer */
 193             plex->stripesize - m.initoffset);               /* and the amount left in this block */
 194
 195         /*
 196          * The number of sectors to transfer in this stripe
 197          * is the minimum of the amount remaining to transfer
 198          * and the amount left in this stripe.
 199          */
 200         m.stripesectors = umin(diskend - *diskaddr,
 201             plex->stripesize * (plex->subdisks - 1) - m.stripeoffset);
 202
 203         /* The number of data subdisks involved in this request */
 204         m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize;
 205
 206         /* Part B: decide what kind of transfer this will be.
 207
 208          * start and end addresses of the transfer in
 209          * the current block.
 210          *
 211          * There are a number of different kinds of
 212          * transfer, each of which relates to a
 213          * specific subdisk:
 214          *
 215          * 1. Normal read.  All participating subdisks
 216          *    are up, and the transfer can be made
 217          *    directly to the user buffer.  The bounds
 218          *    of the transfer are described by
 219          *    m.dataoffset and m.datalen.  We have
 220          *    already calculated m.initoffset and
 221          *    m.initlen, which define the parameters
 222          *    for the first data block.
 223          *
 224          * 2. Recovery read.  One participating
 225          *    subdisk is down.  To recover data, all
 226          *    the other subdisks, including the parity
 227          *    subdisk, must be read.  The data is
 228          *    recovered by exclusive-oring all the
 229          *    other blocks.  The bounds of the
 230          *    transfer are described by m.groupoffset
 231          *    and m.grouplen.
 232          *
 233          * 3. A read request may request reading both
 234          *    available data (normal read) and
 235          *    non-available data (recovery read).
 236          *    This can be a problem if the address
 237          *    ranges of the two reads do not coincide:
 238          *    in this case, the normal read needs to
 239          *    be extended to cover the address range
 240          *    of the recovery read, and must thus be
 241          *    performed out of malloced memory.
 242          *
 243          * 4. Normal write.  All the participating
 244          *    subdisks are up.  The bounds of the
 245          *    transfer are described by m.dataoffset
 246          *    and m.datalen.  Since these values
 247          *    differ for each block, we calculate the
 248          *    bounds for the parity block
 249          *    independently as the maximum of the
 250          *    individual blocks and store these values
 251          *    in m.writeoffset and m.writelen.  This
 252          *    write proceeds in four phases:
 253          *
 254          *    i.  Read the old contents of each block
 255          *        and the parity block.
 256          *    ii.  ``Remove'' the old contents from
 257          *         the parity block with exclusive or.
 258          *    iii. ``Insert'' the new contents of the
 259          *          block in the parity block, again
 260          *          with exclusive or.
 261          *
 262          *    iv.  Write the new contents of the data
 263          *         blocks and the parity block.  The data
 264          *         block transfers can be made directly from
 265          *         the user buffer.
 266          *
 267          * 5. Degraded write where the data block is
 268          *    not available.  The bounds of the
 269          *    transfer are described by m.groupoffset
 270          *    and m.grouplen. This requires the
 271          *    following steps:
 272          *
 273          *    i.  Read in all the other data blocks,
 274          *        excluding the parity block.
 275          *
 276          *    ii.  Recreate the parity block from the
 277          *         other data blocks and the data to be
 278          *         written.
 279          *
 280          *    iii. Write the parity block.
 281          *
 282          * 6. Parityless write, a write where the
 283          *    parity block is not available.  This is
 284          *    in fact the simplest: just write the
 285          *    data blocks.  This can proceed directly
 286          *    from the user buffer.  The bounds of the
 287          *    transfer are described by m.dataoffset
 288          *    and m.datalen.
 289          *
 290          * 7. Combination of degraded data block write
 291          *    and normal write.  In this case the
 292          *    address ranges of the reads may also
 293          *    need to be extended to cover all
 294          *    participating blocks.
 295          *
 296          * All requests in a group transfer transfer
 297          * the same address range relative to their
 298          * subdisk.  The individual transfers may
 299          * vary, but since our group of requests is
 300          * all in a single slice, we can define a
 301          * range in which they all fall.
 302          *
 303          * In the following code section, we determine
 304          * which kind of transfer we will perform.  If
 305          * there is a group transfer, we also decide
 306          * its bounds relative to the subdisks.  At
 307          * the end, we have the following values:
 308          *
 309          *  m.flags indicates the kinds of transfers
 310          *    we will perform.
 311          *  m.initoffset indicates the offset of the
 312          *    beginning of any data operation relative
 313          *    to the beginning of the stripe base.
 314          *  m.initlen specifies the length of any data
 315          *    operation.
 316          *  m.dataoffset contains the same value as
 317          *    m.initoffset.
 318          *  m.datalen contains the same value as
 319          *    m.initlen.  Initially dataoffset and
 320          *    datalen describe the parameters for the
 321          *    first data block; while building the data
 322          *    block requests, they are updated for each
 323          *    block.
 324          *  m.groupoffset indicates the offset of any
 325          *    group operation relative to the beginning
 326          *    of the stripe base.
 327          *  m.grouplen specifies the length of any
 328          *    group operation.
 329          *  m.writeoffset indicates the offset of a
 330          *    normal write relative to the beginning of
 331          *    the stripe base.  This value differs from
 332          *    m.dataoffset in that it applies to the
 333          *    entire operation, and not just the first
 334          *    block.
 335          *  m.writelen specifies the total span of a
 336          *    normal write operation.  writeoffset and
 337          *    writelen are used to define the parity
 338          *    block.
 339          */
 340         m.groupoffset = 0;                                  /* assume no group... */
 341         m.grouplen = 0;                                     /* until we know we have one */
 342         m.writeoffset = m.initoffset;                       /* start offset of transfer */
 343         m.writelen = 0;                                     /* nothing to write yet */
 344         m.flags = 0;                                        /* no flags yet */
 345         rsectors = m.stripesectors;                         /* remaining sectors to examine */
 346         m.dataoffset = m.initoffset;                        /* start at the beginning of the transfer */
 347         m.datalen = m.initlen;
 348
 349         if (m.sdcount > 1) {
 350             plex->multiblock++;                             /* more than one block for the request */
 351             /*
 352              * If we have two transfers that don't overlap,
 353              * (one at the end of the first block, the other
 354              * at the beginning of the second block),
 355              * it's cheaper to split them.  Only the first
      * block is handled in this pass; the rest is
      * picked up by the next pass of the outer loop.
 356              */
 357             if (rsectors < plex->stripesize) {
 358                 m.sdcount = 1;                              /* just one subdisk */
 359                 m.stripesectors = m.initlen;                /* and just this many sectors */
 360                 rsectors = m.initlen;                       /* and in the loop counter */
 361             }
 362         }
 363         if (SD[plex->sdnos[m.psdno]].state < sd_reborn)     /* is our parity subdisk down? */
 364             m.badsdno = m.psdno;                            /* note that it's down */
 365         if (bp->b_cmd == BUF_CMD_READ) {                    /* read operation */
 366             for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
 367                 if (mysdno == m.psdno)                      /* ignore parity on read */
 368                     mysdno++;
 369                 if (mysdno == plex->subdisks)               /* wraparound */
 370                     mysdno = 0;
 371                 if (mysdno == m.psdno)                      /* parity again after wrapping, */
 372                     mysdno++;                               /* skip it */
 373
 374                 if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */
 375                     if (m.badsdno >= 0)                     /* we had one already (possibly the parity subdisk), */
 376                         return REQUEST_DOWN;                /* we can't take a second */
 377                     m.badsdno = mysdno;                     /* got the first */
 378                     m.groupoffset = m.dataoffset;           /* define the bounds */
 379                     m.grouplen = m.datalen;
 380                     m.flags |= XFR_RECOVERY_READ;           /* we need recovery */
 381                     plex->recovered_reads++;                /* count another one */
 382                 } else
 383                     m.flags |= XFR_NORMAL_READ;             /* normal read */
 384
 385                 /* Update the pointers for the next block */
 386                 m.dataoffset = 0;                           /* back to the start of the stripe */
 387                 rsectors -= m.datalen;                      /* remaining sectors to examine */
 388                 m.datalen = umin(rsectors, plex->stripesize); /* amount that will fit in this block */
 389             }
 390         } else {                                            /* write operation */
 391             for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
 392                 if (mysdno == m.psdno)                      /* parity stripe, we've dealt with that */
 393                     mysdno++;
 394                 if (mysdno == plex->subdisks)               /* wraparound */
 395                     mysdno = 0;
 396                 if (mysdno == m.psdno)                      /* parity again after wrapping, */
 397                     mysdno++;                               /* skip it */
 398
 399                 sd = &SD[plex->sdnos[mysdno]];
 400                 if (sd->state != sd_up) {
 401                     enum requeststatus s;
 402
 403                     s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
 404                     if (s && (m.badsdno >= 0)) {            /* second bad disk, */
 405                         int sdno;
 406                         /*
 407                          * If the parity disk is down, there's
 408                          * no recovery.  We make all involved
 409                          * subdisks stale.  Otherwise, we
 410                          * should be able to recover, but it's
 411                          * like pulling teeth.  Fix it later.
                              *
                              * NOTE(review): the loop below visits
                              * plex->sdnos[0 .. m.sdcount - 1], i.e.
                              * the first m.sdcount subdisks of the
                              * plex, rather than walking from
                              * m.firstsdno and skipping parity as the
                              * other loops do.  Confirm that these
                              * are the subdisks meant by "involved".
                              * The sd declared inside the loop also
                              * shadows the function-level sd.
 412                          */
 413                         for (sdno = 0; sdno < m.sdcount; sdno++) {
 414                             struct sd *sd = &SD[plex->sdnos[sdno]];
 415                             if (sd->state >= sd_reborn)     /* sort of up, */
 416                                 set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */
 417                         }
 418                         return s;                           /* and give up */
 419                     }
 420                     m.badsdno = mysdno;                     /* note which one is bad */
 421                     m.flags |= XFR_DEGRADED_WRITE;          /* we need recovery */
 422                     plex->degraded_writes++;                /* count another one */
 423                     m.groupoffset = m.dataoffset;           /* define the bounds */
 424                     m.grouplen = m.datalen;
 425                 } else {
 426                     m.flags |= XFR_NORMAL_WRITE;            /* normal write operation */
 427                     if (m.writeoffset > m.dataoffset) {     /* move write operation lower */
 428                         m.writelen = umax(m.writeoffset + m.writelen,
 429                             m.dataoffset + m.datalen)
 430                             - m.dataoffset;
 431                         m.writeoffset = m.dataoffset;
 432                     } else                                  /* keep the start, extend the end if necessary */
 433                         m.writelen = umax(m.writeoffset + m.writelen,
 434                             m.dataoffset + m.datalen)
 435                             - m.writeoffset;
 436                 }
 437
 438                 /* Update the pointers for the next block */
 439                 m.dataoffset = 0;                           /* back to the start of the stripe */
 440                 rsectors -= m.datalen;                      /* remaining sectors to examine */
 441                 m.datalen = umin(rsectors, plex->stripesize); /* amount that will fit in this block */
 442             }
 443             if (m.badsdno == m.psdno) {                     /* got a bad parity block, */
 444                 struct sd *psd = &SD[plex->sdnos[m.psdno]];
 445
 446                 if (psd->state == sd_down)
 447                     set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */
 448                 else if (psd->state == sd_crashed)
 449                     set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */
 450                 m.flags &= ~XFR_NORMAL_WRITE;               /* this write isn't normal, */
 451                 m.flags |= XFR_PARITYLESS_WRITE;            /* it's parityless */
 452                 plex->parityless_writes++;                  /* count another one */
 453             }
 454         }
 455
 456         /* reset the initial transfer values, which the loops above have modified */
 457         m.dataoffset = m.initoffset;                        /* start at the beginning of the transfer */
 458         m.datalen = m.initlen;
 459
 460         /* decide how many requests we need */
 461         if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))
 462             /* doing a recovery read or degraded write, */
 463             m.rqcount = plex->subdisks;                     /* all subdisks */
 464         else if (m.flags & XFR_NORMAL_WRITE)                /* normal write, */
 465             m.rqcount = m.sdcount + 1;                      /* all data blocks and the parity block */
 466         else                                                /* parityless write or normal read */
 467             m.rqcount = m.sdcount;                          /* just the data blocks */
 468
 469         /* Part C: build the requests */
 470         rqg = allocrqg(rq, m.rqcount);                      /* get a request group */
 471         if (rqg == NULL) {                                  /* malloc failed */
 472             bp->b_error = ENOMEM;
 473             bp->b_flags |= B_ERROR;
 474             return REQUEST_ENOMEM;
 475         }
 476         rqg->plexno = plexno;
 477         rqg->flags = m.flags;
 478         rqno = 0;                                           /* index in the request group */
 479
 480         /* 1: PARITY BLOCK */
 481         /*
 482          * Are we performing an operation which requires parity?  In that case,
 483          * work out the parameters and define the parity block.
 484          * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE
      *
      * NOTE(review): unlike the allocrqg() failure above, the
      * build_rq_buffer() failure paths in this function return
      * without setting bp->b_error or B_ERROR here; confirm that
      * this is handled elsewhere.
 485          */
 486         if (m.flags & XFR_PARITYOP) {                       /* need parity */
 487             rqe = &rqg->rqe[rqno];                          /* point to element */
 488             sd = &SD[plex->sdnos[m.psdno]];                 /* the subdisk in question */
 489             rqe->rqg = rqg;                                 /* point back to group */
 490             rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */
 491             &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE);     /* transfer flags without data op stuff */
 492             setrqebounds(rqe, &m);                          /* set up the bounds of the transfer */
 493             rqe->sdno = sd->sdno;                           /* subdisk number */
 494             rqe->driveno = sd->driveno;
 495             if (build_rq_buffer(rqe, plex))                 /* build the buffer */
 496                 return REQUEST_ENOMEM;                      /* can't do it */
 497             rqe->b.b_cmd = BUF_CMD_READ;                    /* we must read first */
 498             m.sdcount++;                                    /* count the parity element, so that sdcount bounds rqno in the next loop */
 499             rqno++;                                         /* and point to the next request */
 500         }
 501         /*
 502          * 2: DATA BLOCKS
 503          * Now build up requests for the blocks required
 504          * for individual transfers
 505          */
 506         for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) {
 507             if (mysdno == m.psdno)                          /* parity, */
 508                 mysdno++;                                   /* we've given already */
 509             if (mysdno == plex->subdisks)                   /* got to the end, */
 510                 mysdno = 0;                                 /* wrap around */
 511             if (mysdno == m.psdno)                          /* parity, */
 512                 mysdno++;                                   /* we've given already */
 513
 514             rqe = &rqg->rqe[rqno];                          /* point to element */
 515             sd = &SD[plex->sdnos[mysdno]];                  /* the subdisk in question */
 516             rqe->rqg = rqg;                                 /* point to group */
 517             if (m.flags & XFR_NEEDS_MALLOC)                 /* we need a malloced buffer first */
 518                 rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */
 519             else
 520                 rqe->flags = m.flags | XFR_DATA_BLOCK;      /* transfer flags */
 521             if (mysdno == m.badsdno) {                      /* this is the bad subdisk */
 522                 rqg->badsdno = rqno;                        /* note which one (index in the group, not in the plex) */
 523                 rqe->flags |= XFR_BAD_SUBDISK;              /* note that it's dead */
 524                 /*
 525                  * we can't read or write from/to it,
 526                  * but we don't need to malloc
 527                  */
 528                 rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE);
 529             }
 530             setrqebounds(rqe, &m);                          /* set up the bounds of the transfer */
 531             rqe->useroffset = m.useroffset;                 /* offset in user buffer */
 532             rqe->sdno = sd->sdno;                           /* subdisk number */
 533             rqe->driveno = sd->driveno;
 534             if (build_rq_buffer(rqe, plex))                 /* build the buffer */
 535                 return REQUEST_ENOMEM;                      /* can't do it */
                 /*
                  * NOTE(review): within this function
                  * XFR_BAD_SUBDISK is only ever set in
                  * rqe->flags, never in m.flags, so the second
                  * half of this test looks as if it is always
                  * true.  Confirm whether rqe->flags was meant.
                  */
 536             if ((m.flags & XFR_PARITYOP)                    /* parity operation, */
 537             &&((m.flags & XFR_BAD_SUBDISK) == 0))           /* and not the bad subdisk, */
 538                 rqe->b.b_cmd = BUF_CMD_READ;                /* we must read first */
 539
 540             /* Now update pointers for the next block */
 541             *diskaddr += m.datalen;                         /* skip past what we've done */
 542             m.stripesectors -= m.datalen;                   /* deduct from what's left */
 543             m.useroffset += m.datalen;                      /* and move on in the user buffer */
 544             m.datalen = umin(m.stripesectors, plex->stripesize);        /* and recalculate */
 545             m.dataoffset = 0;                               /* start at the beginning of next block */
 546         }
 547
 548         /*
 549          * 3: REMAINING BLOCKS FOR RECOVERY
 550          * Finally, if we have a recovery operation, build
 551          * up transfers for the other subdisks.  Follow the
 552          * subdisks around until we get to where we started.
 553          * These requests use only the group parameters.
 554          */
 555         if ((rqno < m.rqcount)                              /* haven't done them all already */
 556         &&(m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) {
 557             for (; rqno < m.rqcount; rqno++, mysdno++) {
 558                 if (mysdno == m.psdno)                      /* parity, */
 559                     mysdno++;                               /* we've given already */
 560                 if (mysdno == plex->subdisks)               /* got to the end, */
 561                     mysdno = 0;                             /* wrap around */
 562                 if (mysdno == m.psdno)                      /* parity, */
 563                     mysdno++;                               /* we've given already */
 564
 565                 rqe = &rqg->rqe[rqno];                      /* point to element */
 566                 sd = &SD[plex->sdnos[mysdno]];              /* the subdisk in question */
 567                 rqe->rqg = rqg;                             /* point to group */
 568
 569                 rqe->sdoffset = m.sdbase + m.groupoffset;   /* start of transfer */
 570                 rqe->dataoffset = 0;                        /* for tidiness' sake */
 571                 rqe->groupoffset = 0;                       /* group starts at the beginning */
 572                 rqe->datalen = 0;
 573                 rqe->grouplen = m.grouplen;
 574                 rqe->buflen = m.grouplen;
 575                 rqe->flags = (m.flags | XFR_MALLOCED)       /* transfer flags without data op stuff */
 576                 &~XFR_DATAOP;
 577                 rqe->sdno = sd->sdno;                       /* subdisk number */
 578                 rqe->driveno = sd->driveno;
 579                 if (build_rq_buffer(rqe, plex))             /* build the buffer */
 580                     return REQUEST_ENOMEM;                  /* can't do it */
 581                 rqe->b.b_cmd = BUF_CMD_READ;                /* we must read first */
 582             }
 583         }
 584         /*
 585          * We need to lock the address range before
 586          * doing anything.  We don't have to be
 587          * performing a recovery operation: somebody
 588          * else could be doing so, and the results could
 589          * influence us.  Note the fact here, we'll perform
 590          * the lock in launch_requests.
 591          */
 592         rqg->lockbase = m.stripebase;
 593         if (*diskaddr < diskend)                            /* didn't finish the request on this stripe */
 594             plex->multistripe++;                            /* count another one */
 595     }
 596     return REQUEST_OK;
 597 }
 598
 599 /*
 600  * Helper function for rqe5: adjust the bounds of
 601  * the transfers to minimize the buffer
 602  * allocation.
 603  *
 604  * Each request can handle two of three different
 605  * data ranges:
 606  *
 607  * 1.  The range described by the parameters
 608  *     dataoffset and datalen, for normal read or
 609  *     parityless write.
 610  * 2.  The range described by the parameters
 611  *     groupoffset and grouplen, for recovery read
 612  *     and degraded write.
 613  * 3.  For normal write, the range depends on the
 614  *     kind of block.  For data blocks, the range
 615  *     is defined by dataoffset and datalen.  For
 616  *     parity blocks, it is defined by writeoffset
 617  *     and writelen.
 618  *
 619  * In order not to allocate more memory than
 620  * necessary, this function adjusts the bounds
 621  * parameter for each request to cover just the
 622  * minimum necessary for the function it performs.
 623  * This will normally vary from one request to the
 624  * next.
 625  *
 626  * Things are slightly different for the parity
 627  * block.  In this case, the bounds defined by
 628  * mp->writeoffset and mp->writelen also play a
 629  * rôle.  Select this case by setting the
 630  * parameter forparity != 0
 631  */
 632 void
 633 setrqebounds(struct rqelement *rqe, struct metrics *mp)
 634 {
 635     /* parity block of a normal write */
 636     if ((rqe->flags & (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK))
 637         == (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) {         /* case 3 */
 638         if (rqe->flags & XFR_DEGRADED_WRITE) {              /* also degraded write */
 639             /*
 640              * With a combined normal and degraded write, we
 641              * will zero out the area of the degraded write
 642              * in the second phase, so we don't need to read
 643              * it in.  Unfortunately, we need a way to tell
 644              * build_request_buffer the size of the buffer,
 645              * and currently that's the length of the read.
 646              * As a result, we read everything, even the stuff
 647              * that we're going to nuke.
 648              * FIXME XXX
 649              */
 650             if (mp->groupoffset < mp->writeoffset) {        /* group operation starts lower */
 651                 rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
 652                 rqe->dataoffset = mp->writeoffset - mp->groupoffset; /* data starts here */
 653                 rqe->groupoffset = 0;                       /* and the group at the beginning */
 654             } else {                                        /* individual data starts first */
 655                 rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */
 656                 rqe->dataoffset = 0;                        /* individual data starts at the beginning */
 657                 rqe->groupoffset = mp->groupoffset - mp->writeoffset; /* group starts here */
 658             }
 659             rqe->datalen = mp->writelen;
 660             rqe->grouplen = mp->grouplen;
 661         } else {                                            /* just normal write (case 3) */
 662             rqe->sdoffset = mp->sdbase + mp->writeoffset;   /* start of transfer */
 663             rqe->dataoffset = 0;                            /* degradation starts at the beginning */
 664             rqe->groupoffset = 0;                           /* for tidiness' sake */
 665             rqe->datalen = mp->writelen;
 666             rqe->grouplen = 0;
 667         }
 668     } else if (rqe->flags & XFR_DATAOP) {                   /* data operation (case 1 or 3) */
 669         if (rqe->flags & XFR_GROUPOP) {                     /* also a group operation (case 2) */
 670             if (mp->groupoffset < mp->dataoffset) {         /* group operation starts lower */
 671                 rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
 672                 rqe->dataoffset = mp->dataoffset - mp->groupoffset; /* data starts here */
 673                 rqe->groupoffset = 0;                       /* and the group at the beginning */
 674             } else {                                        /* individual data starts first */
 675                 rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */
 676                 rqe->dataoffset = 0;                        /* individual data starts at the beginning */
 677                 rqe->groupoffset = mp->groupoffset - mp->dataoffset; /* group starts here */
 678             }
 679             rqe->datalen = mp->datalen;
 680             rqe->grouplen = mp->grouplen;
 681         } else {                                            /* just data operation (case 1) */
 682             rqe->sdoffset = mp->sdbase + mp->dataoffset;    /* start of transfer */
 683             rqe->dataoffset = 0;                            /* degradation starts at the beginning */
 684             rqe->groupoffset = 0;                           /* for tidiness' sake */
 685             rqe->datalen = mp->datalen;
 686             rqe->grouplen = 0;
 687         }
 688     } else {                                                /* just group operations (case 2) */
 689         rqe->sdoffset = mp->sdbase + mp->groupoffset;       /* start of transfer */
 690         rqe->dataoffset = 0;                                /* for tidiness' sake */
 691         rqe->groupoffset = 0;                               /* group starts at the beginining */
 692         rqe->datalen = 0;
 693         rqe->grouplen = mp->grouplen;
 694     }
 695     rqe->buflen = umax(rqe->dataoffset + rqe->datalen,      /* total buffer length */
 696         rqe->groupoffset + rqe->grouplen);
 697 }