Remove some emacs variable settings.
[dragonfly.git] / sys / dev / raid / vinum / vinuminterrupt.c
1 /* vinuminterrupt.c: bottom half of the driver */
2
3 /*-
4  * Copyright (c) 1997, 1998, 1999
5  *      Nan Yang Computer Services Limited.  All rights reserved.
6  *
7  *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
8  *
9  *  Written by Greg Lehey
10  *
11  *  This software is distributed under the so-called ``Berkeley
12  *  License'':
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *      This product includes software developed by Nan Yang Computer
25  *      Services Limited.
26  * 4. Neither the name of the Company nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * This software is provided ``as is'', and any express or implied
31  * warranties, including, but not limited to, the implied warranties of
32  * merchantability and fitness for a particular purpose are disclaimed.
33  * In no event shall the company or contributors be liable for any
34  * direct, indirect, incidental, special, exemplary, or consequential
35  * damages (including, but not limited to, procurement of substitute
36  * goods or services; loss of use, data, or profits; or business
37  * interruption) however caused and on any theory of liability, whether
38  * in contract, strict liability, or tort (including negligence or
39  * otherwise) arising in any way out of the use of this software, even if
40  * advised of the possibility of such damage.
41  *
42  * $Id: vinuminterrupt.c,v 1.12 2000/11/24 03:41:42 grog Exp grog $
43  * $FreeBSD: src/sys/dev/vinum/vinuminterrupt.c,v 1.25.2.3 2001/05/28 05:56:27 grog Exp $
44  */
45
46 #include "vinumhdr.h"
47 #include "request.h"
48 #include <sys/resourcevar.h>
49
50 void complete_raid5_write(struct rqelement *);
51 void complete_rqe(struct bio *bio);
52 void sdio_done(struct bio *bio);
53
54 /*
55  * Take a completed buffer, transfer the data back if
56  * it's a read, and complete the high-level request
57  * if this is the last subrequest.
58  *
59  * The bp parameter is in fact a struct rqelement, which
60  * includes a couple of extras at the end.
61  */
62 void
63 complete_rqe(struct bio *bio)
64 {
65     union daemoninfo di;
66     struct buf *bp = bio->bio_buf;
67     struct rqelement *rqe;
68     struct request *rq;
69     struct rqgroup *rqg;
70     struct bio *ubio;                                       /* user buffer */
71     struct drive *drive;
72     struct sd *sd;
73     char *gravity;                                          /* for error messages */
74
75     get_mplock();
76
77     rqe = (struct rqelement *) bp;                          /* point to the element that completed */
78     rqg = rqe->rqg;                                         /* and the request group */
79     rq = rqg->rq;                                           /* and the complete request */
80     ubio = rq->bio;                                         /* user buffer */
81
82 #ifdef VINUMDEBUG
83     if (debug & DEBUG_LASTREQS)
84         logrq(loginfo_iodone, (union rqinfou) rqe, ubio);
85 #endif
86     drive = &DRIVE[rqe->driveno];
87     drive->active--;                                        /* one less outstanding I/O on this drive */
88     vinum_conf.active--;                                    /* one less outstanding I/O globally */
89     if ((drive->active == (DRIVE_MAXACTIVE - 1))            /* we were at the drive limit */
90     ||(vinum_conf.active == VINUM_MAXACTIVE))               /* or the global limit */
91         wakeup(&launch_requests);                           /* let another one at it */
92     if ((bp->b_flags & B_ERROR) != 0) {                     /* transfer in error */
93         gravity = "";
94         sd = &SD[rqe->sdno];
95
96         if (bp->b_error != 0)                               /* did it return a number? */
97             rq->error = bp->b_error;                        /* yes, put it in. */
98         else if (rq->error == 0)                            /* no: do we have one already? */
99             rq->error = EIO;                                /* no: catchall "I/O error" */
100         sd->lasterror = rq->error;
101         if (bp->b_cmd == BUF_CMD_READ) {
102             if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
103                 gravity = " fatal";
104                 set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
105             }
106             log(LOG_ERR,
107                 "%s:%s read error, offset %lld for %d bytes\n",
108                 gravity,
109                 sd->name,
110                 (long long)bio->bio_offset,
111                 bp->b_bcount);
112         } else {                                            /* write operation */
113             if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
114                 gravity = "fatal ";
115                 set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
116             }
117             log(LOG_ERR,
118                 "%s:%s write error, offset %lld for %d bytes\n",
119                 gravity,
120                 sd->name,
121                 (long long)bio->bio_offset,
122                 bp->b_bcount);
123         }
124         log(LOG_ERR,
125             "%s: user buffer offset %lld for %d bytes\n",
126             sd->name,
127             (long long)ubio->bio_offset,
128             ubio->bio_buf->b_bcount);
129         if (rq->error == ENXIO) {                           /* the drive's down too */
130             log(LOG_ERR,
131                 "%s: fatal drive I/O error, offset %lld for %d bytes\n",
132                 DRIVE[rqe->driveno].label.name,
133                 (long long)bio->bio_offset,
134                 bp->b_bcount);
135             DRIVE[rqe->driveno].lasterror = rq->error;
136             set_drive_state(rqe->driveno,                   /* take the drive down */
137                 drive_down,
138                 setstate_force);
139         }
140     }
141     /* Now update the statistics */
142     if (bp->b_cmd == BUF_CMD_READ) {                            /* read operation */
143         DRIVE[rqe->driveno].reads++;
144         DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
145         SD[rqe->sdno].reads++;
146         SD[rqe->sdno].bytes_read += bp->b_bcount;
147         PLEX[rqe->rqg->plexno].reads++;
148         PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
149         if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
150             VOL[PLEX[rqe->rqg->plexno].volno].reads++;
151             VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
152         }
153     } else {                                                /* write operation */
154         DRIVE[rqe->driveno].writes++;
155         DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
156         SD[rqe->sdno].writes++;
157         SD[rqe->sdno].bytes_written += bp->b_bcount;
158         PLEX[rqe->rqg->plexno].writes++;
159         PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
160         if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
161             VOL[PLEX[rqe->rqg->plexno].volno].writes++;
162             VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
163         }
164     }
165     if (rqg->flags & XFR_RECOVERY_READ) {                   /* recovery read, */
166         int *sdata;                                         /* source */
167         int *data;                                          /* and group data */
168         int length;                                         /* and count involved */
169         int count;                                          /* loop counter */
170         struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */
171
172         /* XOR destination is the user data */
173         sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */
174         data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
175         length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
176
177         for (count = 0; count < length; count++)
178             data[count] ^= sdata[count];
179
180         /*
181          * In a normal read, we will normally read directly
182          * into the user buffer.  This doesn't work if
183          * we're also doing a recovery, so we have to
184          * copy it
185          */
186         if (rqe->flags & XFR_NORMAL_READ) {                 /* normal read as well, */
187             char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
188             char *dst;
189
190             dst = (char *) ubio->bio_buf->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
191             length = rqe->datalen << DEV_BSHIFT;            /* and count involved */
192             bcopy(src, dst, length);                        /* move it */
193         }
194     } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation  */
195     &&(rqg->active == 1))                                   /* and this is the last active request */
196         complete_raid5_write(rqe);
197     /*
198      * This is the earliest place where we can be
199      * sure that the request has really finished,
200      * since complete_raid5_write can issue new
201      * requests.
202      */
203     rqg->active--;                                          /* this request now finished */
204     if (rqg->active == 0) {                                 /* request group finished, */
205         rq->active--;                                       /* one less */
206         if (rqg->lock) {                                    /* got a lock? */
207             unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
208             rqg->lock = 0;
209         }
210     }
211     if (rq->active == 0) {                                  /* request finished, */
212 #ifdef VINUMDEBUG
213         if (debug & DEBUG_RESID) {
214             if (ubio->bio_buf->b_resid != 0)                        /* still something to transfer? */
215                 Debugger("resid");
216         }
217 #endif
218
219         if (rq->error) {                                    /* did we have an error? */
220             if (rq->isplex) {                               /* plex operation, */
221                 ubio->bio_buf->b_flags |= B_ERROR;          /* yes, propagate to user */
222                 ubio->bio_buf->b_error = rq->error;
223             } else {                                        /* try to recover */
224                 di.rq = rq;
225                 queue_daemon_request(daemonrq_ioerror, di); /* let the daemon complete */
226             }
227         } else {
228             ubio->bio_buf->b_resid = 0;                     /* completed our transfer */
229             if (rq->isplex == 0)                            /* volume request, */
230                 VOL[rq->volplex.volno].active--;            /* another request finished */
231             biodone(ubio);                                  /* top level buffer completed */
232             freerq(rq);                                     /* return the request storage */
233         }
234     }
235     rel_mplock();
236 }
237
238 /* Free a request block and anything hanging off it */
239 void
240 freerq(struct request *rq)
241 {
242     struct rqgroup *rqg;
243     struct rqgroup *nrqg;                                   /* next in chain */
244     int rqno;
245
246     for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) {          /* through the whole request chain */
247         if (rqg->lock)                                      /* got a lock? */
248             unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
249         for (rqno = 0; rqno < rqg->count; rqno++) {
250             if ((rqg->rqe[rqno].flags & XFR_MALLOCED)       /* data buffer was malloced, */
251             &&rqg->rqe[rqno].b.b_data)                      /* and the allocation succeeded */
252                 Free(rqg->rqe[rqno].b.b_data);              /* free it */
253             if (rqg->rqe[rqno].flags & XFR_BUFLOCKED) {     /* locked this buffer, */
254                 BUF_UNLOCK(&rqg->rqe[rqno].b);              /* unlock it again */
255                 uninitbufbio(&rqg->rqe[rqno].b);
256             }
257         }
258         nrqg = rqg->next;                                   /* note the next one */
259         Free(rqg);                                          /* and free this one */
260     }
261     Free(rq);                                               /* free the request itself */
262 }
263
264 /* I/O on subdisk completed */
265 void
266 sdio_done(struct bio *bio)
267 {
268     struct sdbuf *sbp;
269
270     get_mplock();
271
272     sbp = (struct sdbuf *) bio->bio_buf;
273     if (sbp->b.b_flags & B_ERROR) {                         /* had an error */
274         sbp->bio->bio_buf->b_flags |= B_ERROR;                      /* propagate upwards */
275         sbp->bio->bio_buf->b_error = sbp->b.b_error;
276     }
277 #ifdef VINUMDEBUG
278     if (debug & DEBUG_LASTREQS)
279         logrq(loginfo_sdiodone, (union rqinfou)bio, bio);
280 #endif
281     sbp->bio->bio_buf->b_resid = sbp->b.b_resid;                            /* copy the resid field */
282     /* Now update the statistics */
283     if (sbp->b.b_cmd == BUF_CMD_READ) {                     /* read operation */
284         DRIVE[sbp->driveno].reads++;
285         DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount;
286         SD[sbp->sdno].reads++;
287         SD[sbp->sdno].bytes_read += sbp->b.b_bcount;
288     } else {                                                /* write operation */
289         DRIVE[sbp->driveno].writes++;
290         DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount;
291         SD[sbp->sdno].writes++;
292         SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
293     }
294     biodone_sync(bio);
295     biodone(sbp->bio);                                      /* complete the caller's I/O */
296     BUF_UNLOCK(&sbp->b);
297     uninitbufbio(&sbp->b);
298     Free(sbp);
299     rel_mplock();
300 }
301
302 /* Start the second phase of a RAID-4 or RAID-5 group write operation. */
303 void
304 complete_raid5_write(struct rqelement *rqe)
305 {
306     int *sdata;                                             /* source */
307     int *pdata;                                             /* and parity block data */
308     int length;                                             /* and count involved */
309     int count;                                              /* loop counter */
310     int rqno;                                               /* request index */
311     int rqoffset;                                           /* offset of request data from parity data */
312     struct bio *ubio;                                       /* user buffer header */
313     struct request *rq;                                     /* pointer to our request */
314     struct rqgroup *rqg;                                    /* and to the request group */
315     struct rqelement *prqe;                                 /* point to the parity block */
316     struct drive *drive;                                    /* drive to access */
317     rqg = rqe->rqg;                                         /* and to our request group */
318     rq = rqg->rq;                                           /* point to our request */
319     ubio = rq->bio;                                         /* user's buffer header */
320     prqe = &rqg->rqe[0];                                    /* point to the parity block */
321
322     /*
323      * If we get to this function, we have normal or
324      * degraded writes, or a combination of both.  We do
325      * the same thing in each case: we perform an
326      * exclusive or to the parity block.  The only
327      * difference is the origin of the data and the
328      * address range.
329      */
330     if (rqe->flags & XFR_DEGRADED_WRITE) {                  /* do the degraded write stuff */
331         pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
332         bzero(pdata, prqe->grouplen << DEV_BSHIFT);         /* start with nothing in the parity block */
333
334         /* Now get what data we need from each block */
335         for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
336             rqe = &rqg->rqe[rqno];                          /* this request */
337             sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
338             length = rqe->grouplen << (DEV_BSHIFT - 2);     /* and count involved */
339
340             /*
341              * Add the data block to the parity block.  Before
342              * we started the request, we zeroed the parity
343              * block, so the result of adding all the other
344              * blocks and the block we want to write will be
345              * the correct parity block.
346              */
347             for (count = 0; count < length; count++)
348                 pdata[count] ^= sdata[count];
349             if ((rqe->flags & XFR_MALLOCED)                 /* the buffer was malloced, */
350             &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) {     /* and we have no normal write, */
351                 Free(rqe->b.b_data);                        /* free it now */
352                 rqe->flags &= ~XFR_MALLOCED;
353             }
354         }
355     }
356     if (rqg->flags & XFR_NORMAL_WRITE) {                    /* do normal write stuff */
357         /* Get what data we need from each block */
358         for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
359             rqe = &rqg->rqe[rqno];                          /* this request */
360             if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
361                 == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) {   /* good data block to write */
362                 sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
363                 rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
364                 pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
365                 length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
366
367                 /*
368                  * "remove" the old data block
369                  * from the parity block
370                  */
371                 if ((pdata < ((int *) prqe->b.b_data))
372                     || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
373                     || (sdata < ((int *) rqe->b.b_data))
374                     || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
375                     panic("complete_raid5_write: bounds overflow");
376                 for (count = 0; count < length; count++)
377                     pdata[count] ^= sdata[count];
378
379                 /* "add" the new data block */
380                 sdata = (int *) (&ubio->bio_buf->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
381                 if ((sdata < ((int *) ubio->bio_buf->b_data))
382                     || (&sdata[length] > ((int *) (ubio->bio_buf->b_data + ubio->bio_buf->b_bcount))))
383                     panic("complete_raid5_write: bounds overflow");
384                 for (count = 0; count < length; count++)
385                     pdata[count] ^= sdata[count];
386
387                 /* Free the malloced buffer */
388                 if (rqe->flags & XFR_MALLOCED) {            /* the buffer was malloced, */
389                     Free(rqe->b.b_data);                    /* free it */
390                     rqe->flags &= ~XFR_MALLOCED;
391                 } else
392                     panic("complete_raid5_write: malloc conflict");
393
394                 if ((rqe->b.b_cmd == BUF_CMD_READ)          /* this was a read */
395                 &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) {  /* and we can write this block */
396                     rqe->b.b_cmd = BUF_CMD_WRITE;   /* we're writing now */
397                     rqe->b.b_bio1.bio_done = complete_rqe;          /* by calling us here */
398                     rqe->flags &= ~XFR_PARITYOP;            /* reset flags that brought us here */
399                     rqe->b.b_data = &ubio->bio_buf->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
400                     rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
401                     rqe->b.b_resid = rqe->b.b_bcount;       /* nothing transferred */
402                     rqe->b.b_bio1.bio_offset += (off_t)rqe->dataoffset << DEV_BSHIFT;       /* point to the correct block */
403                     drive = &DRIVE[rqe->driveno];           /* drive to access */
404                     rqe->b.b_bio1.bio_driver_info = drive->dev;
405                     rqg->active++;                          /* another active request */
406
407                                                             /* We can't sleep here, so we just increment the counters. */
408                     drive->active++;
409                     if (drive->active >= drive->maxactive)
410                         drive->maxactive = drive->active;
411                     vinum_conf.active++;
412                     if (vinum_conf.active >= vinum_conf.maxactive)
413                         vinum_conf.maxactive = vinum_conf.active;
414 #if VINUMDEBUG
415                     if (debug & DEBUG_ADDRESSES)
416                         log(LOG_DEBUG,
417                             "  %s dev %s, sd %d, offset 0x%llx, devoffset 0x%llx, length %d\n",
418                             (rqe->b.b_cmd == BUF_CMD_READ) ? "Read" : "Write",
419                             drive->devicename,
420                             rqe->sdno,
421                             rqe->b.b_bio1.bio_offset - ((off_t)SD[rqe->sdno].driveoffset << DEV_BSHIFT),
422                             rqe->b.b_bio1.bio_offset,
423                             rqe->b.b_bcount);
424                     if (debug & DEBUG_LASTREQS)
425                         logrq(loginfo_raid5_data, (union rqinfou) rqe, ubio);
426 #endif
427                     vn_strategy(drive->vp, &rqe->b.b_bio1);
428                 }
429             }
430         }
431     }
432     /* Finally, write the parity block */
433     rqe = &rqg->rqe[0];
434     rqe->b.b_cmd = BUF_CMD_WRITE;                   /* we're writing now */
435     rqe->b.b_bio1.bio_done = complete_rqe;                          /* by calling us here */
436     rqg->flags &= ~XFR_PARITYOP;                            /* reset flags that brought us here */
437     rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT;            /* length to write */
438     rqe->b.b_resid = rqe->b.b_bcount;                       /* nothing transferred */
439     drive = &DRIVE[rqe->driveno];                           /* drive to access */
440     rqe->b.b_bio1.bio_driver_info = drive->dev;
441     rqg->active++;                                          /* another active request */
442
443     /* We can't sleep here, so we just increment the counters. */
444     drive->active++;
445     if (drive->active >= drive->maxactive)
446         drive->maxactive = drive->active;
447     vinum_conf.active++;
448     if (vinum_conf.active >= vinum_conf.maxactive)
449         vinum_conf.maxactive = vinum_conf.active;
450
451 #if VINUMDEBUG
452     if (debug & DEBUG_ADDRESSES)
453         log(LOG_DEBUG,
454             "  %s dev %s, sd %d, offset 0x%llx, devoffset 0x%llx, length %d\n",
455             (rqe->b.b_cmd == BUF_CMD_READ) ? "Read" : "Write",
456             drive->devicename,
457             rqe->sdno,
458             rqe->b.b_bio1.bio_offset - ((off_t)SD[rqe->sdno].driveoffset << DEV_BSHIFT),
459             rqe->b.b_bio1.bio_offset,
460             rqe->b.b_bcount);
461     if (debug & DEBUG_LASTREQS)
462         logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubio);
463 #endif
464     vn_strategy(drive->vp, &rqe->b.b_bio1);
465 }