4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
29 #include <sys/zfs_context.h>
31 #include <sys/vdev_impl.h>
33 #include <sys/vdev_disk.h>
35 #include <sys/vdev_file.h>
36 #include <sys/vdev_raidz.h>
38 #include <sys/zio_checksum.h>
40 #include <sys/fs/zfs.h>
41 #include <sys/fm/fs/zfs.h>
45 #include <sys/vdev_initialize.h> /* vdev_xlate testing */
49 * Virtual device vector for RAID-Z.
51 * This vdev supports single, double, and triple parity. For single parity,
52 * we use a simple XOR of all the data columns. For double or triple parity,
53 * we use a special case of Reed-Solomon coding. This extends the
54 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
55 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
56 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
60 * Note that the Plank paper claimed to support arbitrary N+M, but was then
61 * amended six years later identifying a critical flaw that invalidates its
62 * claims. Nevertheless, the technique can be adapted to work for up to
63 * triple parity. For additional parity, the amendment "Note: Correction to
64 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
68 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
70 * can be expressed with a single byte. Briefly, the operations on the
71 * field are defined as follows:
73 * o addition (+) is represented by a bitwise XOR
74 * o subtraction (-) is therefore identical to addition: A + B = A - B
75 * o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
86 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
87 * As an aside, this multiplication is derived from the error correcting
88 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
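 *
 * As a quick sanity check of that expression (illustrative values only):
 * 0x13 * 2 = 0x26, since the high bit is clear and the operation is a plain
 * shift; 0x80 * 2 shifts the high bit out and folds it back in via the XOR
 * with 0x1d, giving 0x80 * 2 = 0x1d.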
90 * Observe that any number in the field (except for 0) can be expressed as a
91 * power of 2 -- a generator for the field. We store a table of the powers of
92 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
93 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
94 * than field addition). The inverse of a field element A (A^-1) is therefore
95 * A ^ (255 - 1) = A^254.
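 *
 * For example, 0x13 * 0x1d can be computed from the tables below as
 * 2^(14 + 8) = 2^22 = 0xea, since log_2(0x13) = 14 and log_2(0x1d) = 8.
 * Likewise the inverse of 2 is 2^254 = 0x8e, and indeed 0x8e * 2 = 1.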
97 * The up-to-three parity columns, P, Q, R over several data columns,
98 * D_0, ... D_n-1, can be expressed by field operations:
100 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
101 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
102 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
103 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
104 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
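 *
 * As a small illustration, with just two data columns D_0 = 0x05 and
 * D_1 = 0x07: P = 0x05 + 0x07 = 0x02, Q = 0x05 * 2 + 0x07 = 0x0d, and
 * R = 0x05 * 4 + 0x07 = 0x13.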
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
107 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
108 * independent coefficients. (There are no additional coefficients that have
109 * this property which is why the uncorrected Plank method breaks down.)
 * See the reconstruction code below for how P, Q and R can be used individually
112 * or in concert to recover missing data columns.
115 typedef struct raidz_col {
116 uint64_t rc_devidx; /* child device index for I/O */
117 uint64_t rc_offset; /* device offset */
118 uint64_t rc_size; /* I/O size */
119 abd_t *rc_abd; /* I/O data */
120 void *rc_gdata; /* used to store the "good" version */
121 int rc_error; /* I/O error for this device */
122 uint8_t rc_tried; /* Did we attempt this I/O column? */
123 uint8_t rc_skipped; /* Did we skip this I/O column? */
126 typedef struct raidz_map {
127 uint64_t rm_cols; /* Regular column count */
128 uint64_t rm_scols; /* Count including skipped columns */
129 uint64_t rm_bigcols; /* Number of oversized columns */
130 uint64_t rm_asize; /* Actual total I/O size */
131 uint64_t rm_missingdata; /* Count of missing data devices */
132 uint64_t rm_missingparity; /* Count of missing parity devices */
133 uint64_t rm_firstdatacol; /* First data column/parity count */
134 uint64_t rm_nskip; /* Skipped sectors for padding */
135 uint64_t rm_skipstart; /* Column index of padding start */
136 abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
137 uintptr_t rm_reports; /* # of referencing checksum reports */
138 uint8_t rm_freed; /* map no longer has referencing ZIO */
139 uint8_t rm_ecksuminjected; /* checksum error was injected */
140 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
143 #define VDEV_RAIDZ_P 0
144 #define VDEV_RAIDZ_Q 1
145 #define VDEV_RAIDZ_R 2
147 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
148 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
151 * We provide a mechanism to perform the field multiplication operation on a
152 * 64-bit value all at once rather than a byte at a time. This works by
153 * creating a mask from the top bit in each byte and using that to
154 * conditionally apply the XOR of 0x1d.
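 *
 * Each byte of the mask ends up as 0xff if the corresponding byte of x had
 * its high bit set and 0x00 otherwise, so the XOR applies 0x1d to exactly
 * the bytes that overflowed; the 0xfe mask keeps the bit shifted out of one
 * byte from leaking into its neighbor. For instance, a byte of 0xff becomes
 * (0xfe ^ 0x1d) = 0xe3, which is 0xff * 2 in the field.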
156 #define VDEV_RAIDZ_64MUL_2(x, mask) \
158 (mask) = (x) & 0x8080808080808080ULL; \
159 (mask) = ((mask) << 1) - ((mask) >> 7); \
160 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
161 ((mask) & 0x1d1d1d1d1d1d1d1d); \
164 #define VDEV_RAIDZ_64MUL_4(x, mask) \
166 VDEV_RAIDZ_64MUL_2((x), mask); \
167 VDEV_RAIDZ_64MUL_2((x), mask); \
#define VDEV_LABEL_OFFSET(x) ((x) + VDEV_LABEL_START_SIZE)
173 * Force reconstruction to use the general purpose method.
175 int vdev_raidz_default_to_general;
177 /* Powers of 2 in the Galois field defined above. */
178 static const uint8_t vdev_raidz_pow2[256] = {
179 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
180 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
181 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
182 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
183 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
184 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
185 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
186 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
187 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
188 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
189 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
190 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
191 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
192 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
193 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
194 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
195 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
196 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
197 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
198 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
199 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
200 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
201 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
202 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
203 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
204 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
205 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
206 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
207 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
208 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
209 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
210 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
212 /* Logs of 2 in the Galois field defined above. */
213 static const uint8_t vdev_raidz_log2[256] = {
214 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
215 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
216 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
217 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
218 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
219 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
220 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
221 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
222 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
223 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
224 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
225 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
226 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
227 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
228 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
229 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
230 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
231 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
232 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
233 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
234 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
235 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
236 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
237 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
238 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
239 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
240 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
241 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
242 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
243 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
244 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
245 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
248 static void vdev_raidz_generate_parity(raidz_map_t *rm);
251 * Multiply a given number by 2 raised to the given power.
254 vdev_raidz_exp2(uint_t a, int exp)
260 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
262 exp += vdev_raidz_log2[a];
266 return (vdev_raidz_pow2[exp]);
270 vdev_raidz_map_free(raidz_map_t *rm)
274 for (c = 0; c < rm->rm_firstdatacol; c++) {
275 if (rm->rm_col[c].rc_abd != NULL)
276 abd_free(rm->rm_col[c].rc_abd);
278 if (rm->rm_col[c].rc_gdata != NULL)
279 zio_buf_free(rm->rm_col[c].rc_gdata,
280 rm->rm_col[c].rc_size);
283 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
284 if (rm->rm_col[c].rc_abd != NULL)
285 abd_put(rm->rm_col[c].rc_abd);
288 if (rm->rm_abd_copy != NULL)
289 abd_free(rm->rm_abd_copy);
291 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
295 vdev_raidz_map_free_vsd(zio_t *zio)
297 raidz_map_t *rm = zio->io_vsd;
299 ASSERT0(rm->rm_freed);
302 if (rm->rm_reports == 0)
303 vdev_raidz_map_free(rm);
308 vdev_raidz_cksum_free(void *arg, size_t ignored)
310 raidz_map_t *rm = arg;
312 ASSERT3U(rm->rm_reports, >, 0);
314 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
315 vdev_raidz_map_free(rm);
319 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
321 raidz_map_t *rm = zcr->zcr_cbdata;
322 size_t c = zcr->zcr_cbinfo;
325 const char *good = NULL;
328 if (good_data == NULL) {
329 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
333 if (c < rm->rm_firstdatacol) {
335 * The first time through, calculate the parity blocks for
336 * the good data (this relies on the fact that the good
337 * data never changes for a given logical ZIO)
339 if (rm->rm_col[0].rc_gdata == NULL) {
340 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
345 * Set up the rm_col[]s to generate the parity for
346 * good_data, first saving the parity bufs and
347 * replacing them with buffers to hold the result.
349 for (x = 0; x < rm->rm_firstdatacol; x++) {
350 bad_parity[x] = rm->rm_col[x].rc_abd;
351 rm->rm_col[x].rc_gdata =
352 zio_buf_alloc(rm->rm_col[x].rc_size);
353 rm->rm_col[x].rc_abd =
354 abd_get_from_buf(rm->rm_col[x].rc_gdata,
355 rm->rm_col[x].rc_size);
358 /* fill in the data columns from good_data */
359 buf = (char *)good_data;
360 for (; x < rm->rm_cols; x++) {
361 abd_put(rm->rm_col[x].rc_abd);
362 rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
363 rm->rm_col[x].rc_size);
364 buf += rm->rm_col[x].rc_size;
368 * Construct the parity from the good data.
370 vdev_raidz_generate_parity(rm);
372 /* restore everything back to its original state */
373 for (x = 0; x < rm->rm_firstdatacol; x++) {
374 abd_put(rm->rm_col[x].rc_abd);
375 rm->rm_col[x].rc_abd = bad_parity[x];
379 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
380 abd_put(rm->rm_col[x].rc_abd);
381 rm->rm_col[x].rc_abd = abd_get_offset(
382 rm->rm_abd_copy, offset);
383 offset += rm->rm_col[x].rc_size;
387 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
388 good = rm->rm_col[c].rc_gdata;
390 /* adjust good_data to point at the start of our column */
393 for (x = rm->rm_firstdatacol; x < c; x++)
394 good += rm->rm_col[x].rc_size;
397 bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
398 /* we drop the ereport if it ends up that the data was good */
399 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
400 abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
404 * Invoked indirectly by zfs_ereport_start_checksum(), called
405 * below when our read operation fails completely. The main point
406 * is to keep a copy of everything we read from disk, so that at
407 * vdev_raidz_cksum_finish() time we can compare it with the good data.
410 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
412 size_t c = (size_t)(uintptr_t)arg;
415 raidz_map_t *rm = zio->io_vsd;
418 /* set up the report and bump the refcount */
419 zcr->zcr_cbdata = rm;
421 zcr->zcr_finish = vdev_raidz_cksum_finish;
422 zcr->zcr_free = vdev_raidz_cksum_free;
425 ASSERT3U(rm->rm_reports, >, 0);
427 if (rm->rm_abd_copy != NULL)
431 * It's the first time we're called for this raidz_map_t, so we need
432 * to copy the data aside; there's no guarantee that our zio's buffer
433 * won't be re-used for something else.
 * Our parity data is already in separate buffers, so there's no need
 * to copy them.
440 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
441 size += rm->rm_col[c].rc_size;
444 abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
446 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
447 raidz_col_t *col = &rm->rm_col[c];
448 abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
450 abd_copy(tmp, col->rc_abd, col->rc_size);
451 abd_put(col->rc_abd);
454 offset += col->rc_size;
456 ASSERT3U(offset, ==, size);
459 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
460 vdev_raidz_map_free_vsd,
461 vdev_raidz_cksum_report
465 * Divides the IO evenly across all child vdevs; usually, dcols is
466 * the number of children in the target vdev.
469 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
470 uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
473 /* The starting RAIDZ (parent) vdev sector of the block. */
474 uint64_t b = offset >> unit_shift;
475 /* The zio's size in units of the vdev's minimum sector size. */
476 uint64_t s = size >> unit_shift;
477 /* The first column for this stripe. */
478 uint64_t f = b % dcols;
479 /* The starting byte offset on each child vdev. */
480 uint64_t o = (b / dcols) << unit_shift;
481 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
485 * "Quotient": The number of data sectors for this stripe on all but
486 * the "big column" child vdevs that also contain "remainder" data.
488 q = s / (dcols - nparity);
491 * "Remainder": The number of partial stripe data sectors in this I/O.
492 * This will add a sector to some, but not all, child vdevs.
494 r = s - q * (dcols - nparity);
496 /* The number of "big columns" - those which contain remainder data. */
497 bc = (r == 0 ? 0 : r + nparity);
 * The total number of data and parity sectors associated with this I/O.
503 tot = s + nparity * (q + (r == 0 ? 0 : 1));
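/*
 * As a purely illustrative example (with made-up values): unit_shift = 9
 * (512-byte sectors), dcols = 4, nparity = 1, and a 10 KB I/O give s = 20,
 * q = 20 / 3 = 6, r = 2, and bc = 3. The first three columns (parity plus
 * two "big" data columns) get q + 1 = 7 sectors each, the last data column
 * gets 6, tot = 27, and a single skip sector pads the map out to a multiple
 * of nparity + 1.
 */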
505 /* acols: The columns that will be accessed. */
506 /* scols: The columns that will be accessed or skipped. */
508 /* Our I/O request doesn't span all child vdevs. */
510 scols = MIN(dcols, roundup(bc, nparity + 1));
516 ASSERT3U(acols, <=, scols);
518 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
521 rm->rm_scols = scols;
523 rm->rm_skipstart = bc;
524 rm->rm_missingdata = 0;
525 rm->rm_missingparity = 0;
526 rm->rm_firstdatacol = nparity;
527 rm->rm_abd_copy = NULL;
530 rm->rm_ecksuminjected = 0;
534 for (c = 0; c < scols; c++) {
539 coff += 1ULL << unit_shift;
541 rm->rm_col[c].rc_devidx = col;
542 rm->rm_col[c].rc_offset = coff;
543 rm->rm_col[c].rc_abd = NULL;
544 rm->rm_col[c].rc_gdata = NULL;
545 rm->rm_col[c].rc_error = 0;
546 rm->rm_col[c].rc_tried = 0;
547 rm->rm_col[c].rc_skipped = 0;
550 rm->rm_col[c].rc_size = 0;
552 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
554 rm->rm_col[c].rc_size = q << unit_shift;
556 asize += rm->rm_col[c].rc_size;
559 ASSERT3U(asize, ==, tot << unit_shift);
560 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
561 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
562 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
563 ASSERT3U(rm->rm_nskip, <=, nparity);
566 for (c = 0; c < rm->rm_firstdatacol; c++) {
567 rm->rm_col[c].rc_abd =
568 abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
571 for (off = 0, c = rm->rm_firstdatacol; c < acols; c++) {
572 rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
573 off += rm->rm_col[c].rc_size;
578 * If all data stored spans all columns, there's a danger that parity
579 * will always be on the same device and, since parity isn't read
580 * during normal operation, that that device's I/O bandwidth won't be
581 * used effectively. We therefore switch the parity every 1MB.
583 * ... at least that was, ostensibly, the theory. As a practical
584 * matter unless we juggle the parity between all devices evenly, we
585 * won't see any benefit. Further, occasional writes that aren't a
586 * multiple of the LCM of the number of children and the minimum
587 * stripe width are sufficient to avoid pessimal behavior.
588 * Unfortunately, this decision created an implicit on-disk format
589 * requirement that we need to support for all eternity, but only
590 * for single-parity RAID-Z.
592 * If we intend to skip a sector in the zeroth column for padding
593 * we must make sure to note this swap. We will never intend to
594 * skip the first column since at least one data and one parity
595 * column must appear in each row.
597 ASSERT(rm->rm_cols >= 2);
598 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
600 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
601 devidx = rm->rm_col[0].rc_devidx;
602 o = rm->rm_col[0].rc_offset;
603 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
604 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
605 rm->rm_col[1].rc_devidx = devidx;
606 rm->rm_col[1].rc_offset = o;
608 if (rm->rm_skipstart == 0)
609 rm->rm_skipstart = 1;
622 vdev_raidz_p_func(void *buf, size_t size, void *private)
624 struct pqr_struct *pqr = private;
625 const uint64_t *src = buf;
626 int i, cnt = size / sizeof (src[0]);
628 ASSERT(pqr->p && !pqr->q && !pqr->r);
630 for (i = 0; i < cnt; i++, src++, pqr->p++)
637 vdev_raidz_pq_func(void *buf, size_t size, void *private)
639 struct pqr_struct *pqr = private;
640 const uint64_t *src = buf;
642 int i, cnt = size / sizeof (src[0]);
644 ASSERT(pqr->p && pqr->q && !pqr->r);
646 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
648 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
656 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
658 struct pqr_struct *pqr = private;
659 const uint64_t *src = buf;
661 int i, cnt = size / sizeof (src[0]);
663 ASSERT(pqr->p && pqr->q && pqr->r);
665 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
667 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
669 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
677 vdev_raidz_generate_parity_p(raidz_map_t *rm)
683 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
684 src = rm->rm_col[c].rc_abd;
685 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
687 if (c == rm->rm_firstdatacol) {
688 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
690 struct pqr_struct pqr = { p, NULL, NULL };
691 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
692 vdev_raidz_p_func, &pqr);
698 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
700 uint64_t *p, *q, pcnt, ccnt, mask, i;
704 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
705 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
706 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
708 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
709 src = rm->rm_col[c].rc_abd;
710 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
711 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
713 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
715 if (c == rm->rm_firstdatacol) {
716 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
717 (void) memcpy(q, p, rm->rm_col[c].rc_size);
719 struct pqr_struct pqr = { p, q, NULL };
720 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
721 vdev_raidz_pq_func, &pqr);
724 if (c == rm->rm_firstdatacol) {
725 for (i = ccnt; i < pcnt; i++) {
731 * Treat short columns as though they are full of 0s.
732 * Note that there's therefore nothing needed for P.
734 for (i = ccnt; i < pcnt; i++) {
735 VDEV_RAIDZ_64MUL_2(q[i], mask);
742 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
744 uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
748 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
749 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
750 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
751 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
752 rm->rm_col[VDEV_RAIDZ_R].rc_size);
754 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
755 src = rm->rm_col[c].rc_abd;
756 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
757 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
758 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
760 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
762 if (c == rm->rm_firstdatacol) {
763 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
764 (void) memcpy(q, p, rm->rm_col[c].rc_size);
765 (void) memcpy(r, p, rm->rm_col[c].rc_size);
767 struct pqr_struct pqr = { p, q, r };
768 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
769 vdev_raidz_pqr_func, &pqr);
772 if (c == rm->rm_firstdatacol) {
773 for (i = ccnt; i < pcnt; i++) {
780 * Treat short columns as though they are full of 0s.
781 * Note that there's therefore nothing needed for P.
783 for (i = ccnt; i < pcnt; i++) {
784 VDEV_RAIDZ_64MUL_2(q[i], mask);
785 VDEV_RAIDZ_64MUL_4(r[i], mask);
792 * Generate RAID parity in the first virtual columns according to the number of
793 * parity columns available.
796 vdev_raidz_generate_parity(raidz_map_t *rm)
798 switch (rm->rm_firstdatacol) {
800 vdev_raidz_generate_parity_p(rm);
803 vdev_raidz_generate_parity_pq(rm);
806 vdev_raidz_generate_parity_pqr(rm);
809 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
815 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
817 uint64_t *dst = dbuf;
818 uint64_t *src = sbuf;
819 int cnt = size / sizeof (src[0]);
821 for (int i = 0; i < cnt; i++) {
830 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
833 uint64_t *dst = dbuf;
834 uint64_t *src = sbuf;
836 int cnt = size / sizeof (dst[0]);
838 for (int i = 0; i < cnt; i++, dst++, src++) {
839 VDEV_RAIDZ_64MUL_2(*dst, mask);
848 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
852 int cnt = size / sizeof (dst[0]);
854 for (int i = 0; i < cnt; i++, dst++) {
855 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
856 VDEV_RAIDZ_64MUL_2(*dst, mask);
862 struct reconst_q_struct {
868 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
870 struct reconst_q_struct *rq = private;
872 int cnt = size / sizeof (dst[0]);
874 for (int i = 0; i < cnt; i++, dst++, rq->q++) {
879 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
880 *b = vdev_raidz_exp2(*b, rq->exp);
887 struct reconst_pq_struct {
897 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
899 struct reconst_pq_struct *rpq = private;
903 for (int i = 0; i < size;
904 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
905 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
906 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
907 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
914 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
916 struct reconst_pq_struct *rpq = private;
919 for (int i = 0; i < size;
920 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
921 /* same operation as vdev_raidz_reconst_pq_func() on xd */
922 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
923 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
930 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
937 ASSERT(x >= rm->rm_firstdatacol);
938 ASSERT(x < rm->rm_cols);
940 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
941 ASSERT(rm->rm_col[x].rc_size > 0);
943 src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
944 dst = rm->rm_col[x].rc_abd;
946 abd_copy(dst, src, rm->rm_col[x].rc_size);
948 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
949 uint64_t size = MIN(rm->rm_col[x].rc_size,
950 rm->rm_col[c].rc_size);
952 src = rm->rm_col[c].rc_abd;
953 dst = rm->rm_col[x].rc_abd;
958 (void) abd_iterate_func2(dst, src, 0, 0, size,
959 vdev_raidz_reconst_p_func, NULL);
962 return (1 << VDEV_RAIDZ_P);
966 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
974 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
976 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
977 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
978 rm->rm_col[c].rc_size);
980 src = rm->rm_col[c].rc_abd;
981 dst = rm->rm_col[x].rc_abd;
983 if (c == rm->rm_firstdatacol) {
984 abd_copy(dst, src, size);
985 if (rm->rm_col[x].rc_size > size)
986 abd_zero_off(dst, size,
987 rm->rm_col[x].rc_size - size);
989 ASSERT3U(size, <=, rm->rm_col[x].rc_size);
990 (void) abd_iterate_func2(dst, src, 0, 0, size,
991 vdev_raidz_reconst_q_pre_func, NULL);
992 (void) abd_iterate_func(dst,
993 size, rm->rm_col[x].rc_size - size,
994 vdev_raidz_reconst_q_pre_tail_func, NULL);
998 src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
999 dst = rm->rm_col[x].rc_abd;
1000 exp = 255 - (rm->rm_cols - 1 - x);
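/*
 * After Q is XORed in below, the remaining value is D_x scaled by its
 * coefficient in Q, 2^(rm_cols - 1 - x); exp is the log of that
 * coefficient's inverse, so multiplying by 2^exp recovers D_x.
 */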
1002 struct reconst_q_struct rq = { abd_to_buf(src), exp };
1003 (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
1004 vdev_raidz_reconst_q_post_func, &rq);
1006 return (1 << VDEV_RAIDZ_Q);
1010 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
1012 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1013 abd_t *pdata, *qdata;
1014 uint64_t xsize, ysize;
1021 ASSERT(x >= rm->rm_firstdatacol);
1022 ASSERT(y < rm->rm_cols);
1024 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
1027 * Move the parity data aside -- we're going to compute parity as
1028 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1029 * reuse the parity generation mechanism without trashing the actual
1030 * parity so we make those columns appear to be full of zeros by
1031 * setting their lengths to zero.
1033 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
1034 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1035 xsize = rm->rm_col[x].rc_size;
1036 ysize = rm->rm_col[y].rc_size;
1038 rm->rm_col[VDEV_RAIDZ_P].rc_abd =
1039 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1040 rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
1041 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1042 rm->rm_col[x].rc_size = 0;
1043 rm->rm_col[y].rc_size = 0;
1045 vdev_raidz_generate_parity_pq(rm);
1047 rm->rm_col[x].rc_size = xsize;
1048 rm->rm_col[y].rc_size = ysize;
1050 p = abd_to_buf(pdata);
1051 q = abd_to_buf(qdata);
1052 pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1053 qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1054 xd = rm->rm_col[x].rc_abd;
1055 yd = rm->rm_col[y].rc_abd;
1059 * Pxy = P + D_x + D_y
1060 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1062 * We can then solve for D_x:
1063 * D_x = A * (P + Pxy) + B * (Q + Qxy)
1065 * A = 2^(x - y) * (2^(x - y) + 1)^-1
 *	B = (2^(ndevs - 1 - x))^-1 * (2^(x - y) + 1)^-1
1068 * With D_x in hand, we can easily solve for D_y:
1069 * D_y = P + Pxy + D_x
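 *
 * (These coefficients follow from substituting D_y = (P + Pxy) + D_x into
 * the expression for Q + Qxy and collecting the D_x terms; the shared
 * factor (2^(x - y) + 1)^-1 is what tmp computes below.)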
1072 a = vdev_raidz_pow2[255 + x - y];
1073 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
1074 tmp = 255 - vdev_raidz_log2[a ^ 1];
1076 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1077 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1079 ASSERT3U(xsize, >=, ysize);
1080 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1081 (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1082 vdev_raidz_reconst_pq_func, &rpq);
1083 (void) abd_iterate_func(xd, ysize, xsize - ysize,
1084 vdev_raidz_reconst_pq_tail_func, &rpq);
1086 abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1087 abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1090 * Restore the saved parity data.
1092 rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
1093 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1095 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
1100 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
1102 * the contents of the data and parity disks. This can be expressed with
1103 * vectors for the original data (D) and the actual data (d) and parity (p)
1104 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *      __   __                     __     __
 *      |     |                     |  p_0  |
 *      |  V  |     __     __       |   :   |
 *      |     |     |  D_0  |       | p_m-1 |
 *      |     |  x  |   :   |   =   |  d_0  |
 *      |  I  |     | D_n-1 |       |   :   |
 *      |     |     ~~     ~~       | d_n-1 |
 *      ~~   ~~                     ~~     ~~
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speed of
 * computation, as well as for linear separability.
 *      __                    __                  __     __
 *      |     1  ..  1  1  1  |                   |  p_0  |
 *      | 2^n-1  ..  4  2  1  |     __     __     |   :   |
 *      | 4^n-1  .. 16  4  1  |     |  D_0  |     | p_m-1 |
 *      |     1  ..  0  0  0  |     |  D_1  |     |  d_0  |
 *      |     0  ..  0  0  0  |  x  |  D_2  |  =  |  d_1  |
 *      |     :      :  :  :  |     |   :   |     |  d_2  |
 *      |     0  ..  1  0  0  |     | D_n-1 |     |   :   |
 *      |     0  ..  0  1  0  |     ~~     ~~     |   :   |
 *      |     0  ..  0  0  1  |                   | d_n-1 |
 *      ~~                    ~~                  ~~     ~~
1131 * Note that I, V, d, and p are known. To compute D, we must invert the
1132 * matrix and use the known data and parity values to reconstruct the unknown
1133 * data values. We begin by removing the rows in V|I and d|p that correspond
1134 * to failed or missing columns; we then make V|I square (n x n) and d|p
1135 * sized n by removing rows corresponding to unused parity from the bottom up
1136 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1137 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1138 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *            __                                        __
 *            |   1    1    1    1    1    1    1    1 |
 *            | 128   64   32   16    8    4    2    1 | <-----+-+-- missing disks
 *            |  19  205  116   29   64   16    4    1 |       / /
 *            |   1    0    0    0    0    0    0    0 |      / /
 *            |   0    1    0    0    0    0    0    0 | <----' /
 *  (V|I) =   |   0    0    1    0    0    0    0    0 | <-----'
 *            |   0    0    0    1    0    0    0    0 |
 *            |   0    0    0    0    1    0    0    0 |
 *            |   0    0    0    0    0    1    0    0 |
 *            |   0    0    0    0    0    0    1    0 |
 *            |   0    0    0    0    0    0    0    1 |
 *            ~~                                        ~~
 *            __                                        __
 *            |   1    1    1    1    1    1    1    1 |
 *            |  19  205  116   29   64   16    4    1 |
 *            |   1    0    0    0    0    0    0    0 |
 * (V|I)' =   |   0    0    0    1    0    0    0    0 |
 *            |   0    0    0    0    1    0    0    0 |
 *            |   0    0    0    0    0    1    0    0 |
 *            |   0    0    0    0    0    0    1    0 |
 *            |   0    0    0    0    0    0    0    1 |
 *            ~~                                        ~~
1163 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1164 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1165 * matrix is not singular.
1167 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1168 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1169 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1170 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1171 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1172 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1173 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1174 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1177 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1178 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1179 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1180 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1181 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1182 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1183 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1184 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1187 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1188 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1189 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1190 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1191 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1192 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1193 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1194 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1197 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1198 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1199 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1200 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1201 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1202 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1203 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1204 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1207 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1208 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1209 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1210 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1211 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1212 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1213 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1214 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1217 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1218 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1219 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1220 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1221 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1222 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1223 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1224 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
 *               __                                        __
 *               |   0    0    1    0    0    0    0    0 |
 *               | 167  100    5   41  159  169  217  208 |
 *               | 166  100    4   40  158  168  216  209 |
 * (V|I)'^-1 =   |   0    0    0    1    0    0    0    0 |
 *               |   0    0    0    0    1    0    0    0 |
 *               |   0    0    0    0    0    1    0    0 |
 *               |   0    0    0    0    0    0    1    0 |
 *               |   0    0    0    0    0    0    0    1 |
 *               ~~                                        ~~
1237 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1238 * of the missing data.
1240 * As is apparent from the example above, the only non-trivial rows in the
1241 * inverse matrix correspond to the data disks that we're trying to
1242 * reconstruct. Indeed, those are the only rows we need as the others would
1243 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * the targeted columns.
1250 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1256 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1259 * Fill in the missing rows of interest.
1261 for (i = 0; i < nmap; i++) {
1262 ASSERT3S(0, <=, map[i]);
1263 ASSERT3S(map[i], <=, 2);
1270 for (j = 0; j < n; j++) {
1274 rows[i][j] = vdev_raidz_pow2[pow];
1280 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1281 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1287 * Assert that the first nmissing entries from the array of used
1288 * columns correspond to parity columns and that subsequent entries
1289 * correspond to data columns.
1291 for (i = 0; i < nmissing; i++) {
1292 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1294 for (; i < n; i++) {
1295 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1299 * First initialize the storage where we'll compute the inverse rows.
1301 for (i = 0; i < nmissing; i++) {
1302 for (j = 0; j < n; j++) {
1303 invrows[i][j] = (i == j) ? 1 : 0;
1308 * Subtract all trivial rows from the rows of consequence.
1310 for (i = 0; i < nmissing; i++) {
1311 for (j = nmissing; j < n; j++) {
1312 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1313 jj = used[j] - rm->rm_firstdatacol;
1315 invrows[i][j] = rows[i][jj];
1321 * For each of the rows of interest, we must normalize it and subtract
1322 * a multiple of it from the other rows.
1324 for (i = 0; i < nmissing; i++) {
1325 for (j = 0; j < missing[i]; j++) {
1326 ASSERT0(rows[i][j]);
1328 ASSERT3U(rows[i][missing[i]], !=, 0);
1331 * Compute the inverse of the first element and multiply each
1332 * element in the row by that value.
1334 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1336 for (j = 0; j < n; j++) {
1337 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1338 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1341 for (ii = 0; ii < nmissing; ii++) {
1345 ASSERT3U(rows[ii][missing[i]], !=, 0);
1347 log = vdev_raidz_log2[rows[ii][missing[i]]];
1349 for (j = 0; j < n; j++) {
1351 vdev_raidz_exp2(rows[i][j], log);
1353 vdev_raidz_exp2(invrows[i][j], log);
 * Verify that the data that is left in the rows is properly part of
1360 * an identity matrix.
1362 for (i = 0; i < nmissing; i++) {
1363 for (j = 0; j < n; j++) {
1364 if (j == missing[i]) {
1365 ASSERT3U(rows[i][j], ==, 1);
1367 ASSERT0(rows[i][j]);
1374 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1375 int *missing, uint8_t **invrows, const uint8_t *used)
1380 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1381 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1385 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1389 psize = sizeof (invlog[0][0]) * n * nmissing;
1390 p = kmem_alloc(psize, KM_SLEEP);
1392 for (pp = p, i = 0; i < nmissing; i++) {
1397 for (i = 0; i < nmissing; i++) {
1398 for (j = 0; j < n; j++) {
1399 ASSERT3U(invrows[i][j], !=, 0);
1400 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1404 for (i = 0; i < n; i++) {
1406 ASSERT3U(c, <, rm->rm_cols);
1408 src = abd_to_buf(rm->rm_col[c].rc_abd);
1409 ccount = rm->rm_col[c].rc_size;
1410 for (j = 0; j < nmissing; j++) {
1411 cc = missing[j] + rm->rm_firstdatacol;
1412 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1413 ASSERT3U(cc, <, rm->rm_cols);
1414 ASSERT3U(cc, !=, c);
1416 dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1417 dcount[j] = rm->rm_col[cc].rc_size;
1420 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1422 for (x = 0; x < ccount; x++, src++) {
1424 log = vdev_raidz_log2[*src];
1426 for (cc = 0; cc < nmissing; cc++) {
1427 if (x >= dcount[cc])
1433 if ((ll = log + invlog[cc][i]) >= 255)
1435 val = vdev_raidz_pow2[ll];
1446 kmem_free(p, psize);
1450 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1454 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1455 int parity_map[VDEV_RAIDZ_MAXPARITY];
1460 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1461 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1464 abd_t **bufs = NULL;
1469 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1470 * temporary linear ABDs.
1472 if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1473 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1475 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1476 raidz_col_t *col = &rm->rm_col[c];
1478 bufs[c] = col->rc_abd;
1479 col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1480 abd_copy(col->rc_abd, bufs[c], col->rc_size);
1484 n = rm->rm_cols - rm->rm_firstdatacol;
1487 * Figure out which data columns are missing.
1490 for (t = 0; t < ntgts; t++) {
1491 if (tgts[t] >= rm->rm_firstdatacol) {
1492 missing_rows[nmissing_rows++] =
1493 tgts[t] - rm->rm_firstdatacol;
 * Figure out which parity columns to use to help generate the missing
 * data columns.
1501 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1503 ASSERT(c < rm->rm_firstdatacol);
1506 * Skip any targeted parity columns.
1508 if (c == tgts[tt]) {
1520 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1522 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1523 nmissing_rows * n + sizeof (used[0]) * n;
1524 p = kmem_alloc(psize, KM_SLEEP);
1526 for (pp = p, i = 0; i < nmissing_rows; i++) {
1534 for (i = 0; i < nmissing_rows; i++) {
1535 used[i] = parity_map[i];
1538 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1539 if (tt < nmissing_rows &&
1540 c == missing_rows[tt] + rm->rm_firstdatacol) {
1551 * Initialize the interesting rows of the matrix.
1553 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1556 * Invert the matrix.
1558 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1562 * Reconstruct the missing data using the generated matrix.
1564 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1567 kmem_free(p, psize);
1570 * copy back from temporary linear abds and free them
1573 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1574 raidz_col_t *col = &rm->rm_col[c];
1576 abd_copy(bufs[c], col->rc_abd, col->rc_size);
1577 abd_free(col->rc_abd);
1578 col->rc_abd = bufs[c];
1580 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
1587 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1589 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1593 int nbadparity, nbaddata;
1594 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1597 * The tgts list must already be sorted.
1599 for (i = 1; i < nt; i++) {
1600 ASSERT(t[i] > t[i - 1]);
1603 nbadparity = rm->rm_firstdatacol;
1604 nbaddata = rm->rm_cols - nbadparity;
1606 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1607 if (c < rm->rm_firstdatacol)
1608 parity_valid[c] = B_FALSE;
1610 if (i < nt && c == t[i]) {
1613 } else if (rm->rm_col[c].rc_error != 0) {
1615 } else if (c >= rm->rm_firstdatacol) {
1618 parity_valid[c] = B_TRUE;
1623 ASSERT(ntgts >= nt);
1624 ASSERT(nbaddata >= 0);
1625 ASSERT(nbaddata + nbadparity == ntgts);
1627 dt = &tgts[nbadparity];
1630 * See if we can use any of our optimized reconstruction routines.
1632 if (!vdev_raidz_default_to_general) {
1635 if (parity_valid[VDEV_RAIDZ_P])
1636 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1638 ASSERT(rm->rm_firstdatacol > 1);
1640 if (parity_valid[VDEV_RAIDZ_Q])
1641 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1643 ASSERT(rm->rm_firstdatacol > 2);
1647 ASSERT(rm->rm_firstdatacol > 1);
1649 if (parity_valid[VDEV_RAIDZ_P] &&
1650 parity_valid[VDEV_RAIDZ_Q])
1651 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1653 ASSERT(rm->rm_firstdatacol > 2);
1659 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1660 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1666 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1667 uint64_t *logical_ashift, uint64_t *physical_ashift)
1670 uint64_t nparity = vd->vdev_nparity;
1675 ASSERT(nparity > 0);
1677 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1678 vd->vdev_children < nparity + 1) {
1679 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1680 return (SET_ERROR(EINVAL));
1683 vdev_open_children(vd);
1685 for (c = 0; c < vd->vdev_children; c++) {
1686 cvd = vd->vdev_child[c];
1688 if (cvd->vdev_open_error != 0) {
1689 lasterror = cvd->vdev_open_error;
1694 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1695 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1696 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1697 *physical_ashift = MAX(*physical_ashift,
1698 cvd->vdev_physical_ashift);
1701 *asize *= vd->vdev_children;
1702 *max_asize *= vd->vdev_children;
1704 if (numerrors > nparity) {
1705 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1713 vdev_raidz_close(vdev_t *vd)
1717 for (c = 0; c < vd->vdev_children; c++)
1718 vdev_close(vd->vdev_child[c]);
1723 * Handle a read or write I/O to a RAID-Z dump device.
1725 * The dump device is in a unique situation compared to other ZFS datasets:
1726 * writing to this device should be as simple and fast as possible. In
1727 * addition, durability matters much less since the dump will be extracted
1728 * once the machine reboots. For that reason, this function eschews parity for
1729 * performance and simplicity. The dump device uses the checksum setting
1730 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1733 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1734 * 128 KB will not fill an entire block; in addition, they may not be properly
1735 * aligned. In that case, this function uses the preallocated 128 KB block and
1736 * omits reading or writing any "empty" portions of that block, as opposed to
1737 * allocating a fresh appropriately-sized block.
1739 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1741 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1743 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1744 * allocated which spans all five child vdevs. 8 KB of data would be written to
1745 * each of four vdevs, with the fifth containing the parity bits.
1747 * parity data data data data
1748 * | PP | XX | XX | XX | XX |
1751 * 8 KB parity ------8 KB data blocks------
1753 * However, when writing to the dump device, the behavior is different:
1755 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1757 * Unlike the normal RAID-Z case in which the block is allocated based on the
1758 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1759 * I/O size is less than 128 KB, only the actual portions of data are written.
1760 * In this example the data is written to the third data vdev since that vdev
1761 * contains the offset [64 KB, 96 KB).
1763 * parity data data data data
1769 * As a result, an individual I/O may not span all child vdevs; moreover, a
1770 * small I/O may only operate on a single child vdev.
1772 * Note that since there are no parity bits calculated or written, this format
1773 * remains the same no matter how many parity bits are used in a normal RAID-Z
1774 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1777 * parity parity parity data data data data
1778 * | | | | | | XX | |
1784 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1785 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1787 vdev_t *tvd = vd->vdev_top;
1793 uint64_t start, end, colstart, colend;
1794 uint64_t coloffset, colsize, colskip;
1796 int flags = doread ? BIO_READ : BIO_WRITE;
1801 * Don't write past the end of the block
1803 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1809 * Allocate a RAID-Z map for this block. Note that this block starts
 * from the "original" offset, that is, the offset of the extent which
1811 * contains the requisite offset of the data being read or written.
1813 * Even if this I/O operation doesn't span the full block size, let's
1814 * treat the on-disk format as if the only blocks are the complete 128
1817 abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1818 SPA_OLD_MAXBLOCKSIZE);
1819 rm = vdev_raidz_map_alloc(abd,
1820 SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
1821 vd->vdev_children, vd->vdev_nparity);
1823 coloffset = origoffset;
1825 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1826 c++, coloffset += rc->rc_size) {
1827 rc = &rm->rm_col[c];
1828 cvd = vd->vdev_child[rc->rc_devidx];
1831 * Find the start and end of this column in the RAID-Z map,
1832 * keeping in mind that the stated size and offset of the
1833 * operation may not fill the entire column for this vdev.
1835 * If any portion of the data spans this column, issue the
1836 * appropriate operation to the vdev.
1838 if (coloffset + rc->rc_size <= start)
1840 if (coloffset >= end)
1843 colstart = MAX(coloffset, start);
1844 colend = MIN(end, coloffset + rc->rc_size);
1845 colsize = colend - colstart;
1846 colskip = colstart - coloffset;
1848 VERIFY3U(colsize, <=, rc->rc_size);
1849 VERIFY3U(colskip, <=, rc->rc_size);
1852 * Note that the child vdev will have a vdev label at the start
1853 * of its range of offsets, hence the need for
1854 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1855 * example of why this calculation is needed.
1857 if ((err = vdev_disk_physio(cvd,
1858 ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1859 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1860 flags, isdump)) != 0)
1864 vdev_raidz_map_free(rm);
1873 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1876 uint64_t ashift = vd->vdev_top->vdev_ashift;
1877 uint64_t cols = vd->vdev_children;
1878 uint64_t nparity = vd->vdev_nparity;
1880 asize = ((psize - 1) >> ashift) + 1;
1881 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1882 asize = roundup(asize, nparity + 1) << ashift;
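/*
 * For a purely illustrative set of values: psize = 128 KB with ashift = 9,
 * six children and nparity = 2 gives 256 data sectors plus
 * 2 * ceil(256 / 4) = 128 parity sectors; 384 is already a multiple of
 * nparity + 1, so the allocated size is 384 << 9 = 192 KB.
 */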
1888 vdev_raidz_child_done(zio_t *zio)
1890 raidz_col_t *rc = zio->io_private;
1892 rc->rc_error = zio->io_error;
1898 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
1901 vdev_t *vd = zio->io_vd;
1902 vdev_t *tvd = vd->vdev_top;
1904 range_seg_t logical_rs, physical_rs;
1905 logical_rs.rs_start = zio->io_offset;
1906 logical_rs.rs_end = logical_rs.rs_start +
1907 vdev_raidz_asize(zio->io_vd, zio->io_size);
1909 raidz_col_t *rc = &rm->rm_col[col];
1910 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1912 vdev_xlate(cvd, &logical_rs, &physical_rs);
1913 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
1914 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
1916 * It would be nice to assert that rs_end is equal
1917 * to rc_offset + rc_size but there might be an
 * optional I/O at the end that is not accounted for in rc_size.
1921 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
1922 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
1923 rc->rc_size + (1 << tvd->vdev_ashift));
1925 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
1931 * Start an IO operation on a RAIDZ VDev
1934 * - For write operations:
1935 * 1. Generate the parity data
1936 * 2. Create child zio write operations to each column's vdev, for both
1938 * 3. If the column skips any sectors for padding, create optional dummy
1939 * write zio children for those areas to improve aggregation continuity.
1940 * - For read operations:
1941 * 1. Create child zio read operations to each data column's vdev to read
1942 * the range of data required for zio.
1943 * 2. If this is a scrub or resilver operation, or if any of the data
1944 * vdevs have had errors, then create zio read operations to the parity
1945 * columns' VDevs as well.
1948 vdev_raidz_io_start(zio_t *zio)
1950 vdev_t *vd = zio->io_vd;
1951 vdev_t *tvd = vd->vdev_top;
1957 rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1958 zio->io_type == ZIO_TYPE_FREE,
1959 tvd->vdev_ashift, vd->vdev_children,
1963 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1965 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1967 if (zio->io_type == ZIO_TYPE_FREE) {
1968 for (c = 0; c < rm->rm_cols; c++) {
1969 rc = &rm->rm_col[c];
1970 cvd = vd->vdev_child[rc->rc_devidx];
1971 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1972 rc->rc_offset, rc->rc_abd, rc->rc_size,
1973 zio->io_type, zio->io_priority, 0,
1974 vdev_raidz_child_done, rc));
1981 if (zio->io_type == ZIO_TYPE_WRITE) {
1982 vdev_raidz_generate_parity(rm);
1984 for (c = 0; c < rm->rm_cols; c++) {
1985 rc = &rm->rm_col[c];
1986 cvd = vd->vdev_child[rc->rc_devidx];
1989 * Verify physical to logical translation.
1991 vdev_raidz_io_verify(zio, rm, c);
1993 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1994 rc->rc_offset, rc->rc_abd, rc->rc_size,
1995 zio->io_type, zio->io_priority, 0,
1996 vdev_raidz_child_done, rc));
2000 * Generate optional I/Os for any skipped sectors to improve
2001 * aggregation contiguity.
2003 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
2004 ASSERT(c <= rm->rm_scols);
2005 if (c == rm->rm_scols)
2007 rc = &rm->rm_col[c];
2008 cvd = vd->vdev_child[rc->rc_devidx];
2009 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2010 rc->rc_offset + rc->rc_size, NULL,
2011 1 << tvd->vdev_ashift,
2012 zio->io_type, zio->io_priority,
2013 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2020 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2023 * Iterate over the columns in reverse order so that we hit the parity
2024 * last -- any errors along the way will force us to read the parity.
2026 for (c = rm->rm_cols - 1; c >= 0; c--) {
2027 rc = &rm->rm_col[c];
2028 cvd = vd->vdev_child[rc->rc_devidx];
2029 if (!vdev_readable(cvd)) {
2030 if (c >= rm->rm_firstdatacol)
2031 rm->rm_missingdata++;
2033 rm->rm_missingparity++;
2034 rc->rc_error = SET_ERROR(ENXIO);
2035 rc->rc_tried = 1; /* don't even try */
2039 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2040 if (c >= rm->rm_firstdatacol)
2041 rm->rm_missingdata++;
2043 rm->rm_missingparity++;
2044 rc->rc_error = SET_ERROR(ESTALE);
2048 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
2049 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2050 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2051 rc->rc_offset, rc->rc_abd, rc->rc_size,
2052 zio->io_type, zio->io_priority, 0,
2053 vdev_raidz_child_done, rc));
2062 * Report a checksum error for a child of a RAID-Z device.
2065 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2068 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2070 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2071 zio_bad_cksum_t zbc;
2072 raidz_map_t *rm = zio->io_vsd;
2074 mutex_enter(&vd->vdev_stat_lock);
2075 vd->vdev_stat.vs_checksum_errors++;
2076 mutex_exit(&vd->vdev_stat_lock);
2078 zbc.zbc_has_cksum = 0;
2079 zbc.zbc_injected = rm->rm_ecksuminjected;
2081 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2082 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2083 rc->rc_offset, rc->rc_size, buf, bad_data,
2085 abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2090 * We keep track of whether or not there were any injected errors, so that
2091 * any ereports we generate can note it.
2094 raidz_checksum_verify(zio_t *zio)
2096 zio_bad_cksum_t zbc;
2097 raidz_map_t *rm = zio->io_vsd;
2099 int ret = zio_checksum_error(zio, &zbc);
2100 if (ret != 0 && zbc.zbc_injected != 0)
2101 rm->rm_ecksuminjected = 1;
2107 * Generate the parity from the data columns. If we tried and were able to
2108 * read the parity without error, verify that the generated parity matches the
2109 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
2113 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2115 void *orig[VDEV_RAIDZ_MAXPARITY];
2119 blkptr_t *bp = zio->io_bp;
2120 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2121 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2123 if (checksum == ZIO_CHECKSUM_NOPARITY)
2126 for (c = 0; c < rm->rm_firstdatacol; c++) {
2127 rc = &rm->rm_col[c];
2128 if (!rc->rc_tried || rc->rc_error != 0)
2130 orig[c] = zio_buf_alloc(rc->rc_size);
2131 abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
2134 vdev_raidz_generate_parity(rm);
2136 for (c = 0; c < rm->rm_firstdatacol; c++) {
2137 rc = &rm->rm_col[c];
2138 if (!rc->rc_tried || rc->rc_error != 0)
2140 if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
2141 raidz_checksum_error(zio, rc, orig[c]);
2142 rc->rc_error = SET_ERROR(ECKSUM);
2145 zio_buf_free(orig[c], rc->rc_size);
2152 * Keep statistics on all the ways that we used parity to correct data.
2154 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
2157 vdev_raidz_worst_error(raidz_map_t *rm)
2161 for (int c = 0; c < rm->rm_cols; c++)
2162 error = zio_worst_error(error, rm->rm_col[c].rc_error);
2168 * Iterate over all combinations of bad data and attempt a reconstruction.
2169 * Note that the algorithm below is non-optimal because it doesn't take into
2170 * account how reconstruction is actually performed. For example, with
2171 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2172 * is targeted as invalid as if columns 1 and 4 are targeted since in both
2173 * cases we'd only use parity information in column 0.
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/* This simplifies one edge condition. */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/* Setting tgts[n] simplifies the other edge condition. */
		tgts[n] = rm->rm_cols;

		/* These buffers were allocated in previous iterations. */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				abd_copy_to_buf(orig[i], rc->rc_abd,
				    rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/* Restore the original data. */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				abd_copy_from_buf(rc->rc_abd, orig[i],
				    rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/* If that spot is available, we're done here. */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later. Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed. Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	} else if (zio->io_type == ZIO_TYPE_FREE) {
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children that haven't
		 * failed, provided the I/O wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

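/*
 * For example, on a raidz2 vdev (vdev_nparity == 2): three or more faulted
 * children force VDEV_STATE_CANT_OPEN, while one faulted or degraded child
 * leaves the vdev usable but marks it VDEV_STATE_DEGRADED.
 */
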
/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered. The function
 * assumes that at least one DTL is dirty which implies that full stripe
 * width blocks must be resilvered.
 */
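/*
 * A worked example under assumed values (purely illustrative): with
 * ashift = 9, dcols = 6 and nparity = 1, a block at offset = 9216 with
 * psize = 1024 gives b = 18, s = 2 and f = 0. Since s + nparity = 3 is
 * less than dcols, only child vdevs (0 + c) % 6 for c in [0, 3) -- that
 * is, children 0, 1 and 2 -- have their DTLs checked below.
 */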
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
	uint64_t dcols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = ((psize - 1) >> ashift) + 1;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;

	if (s + nparity >= dcols)
		return (B_TRUE);

	for (uint64_t c = 0; c < s + nparity; c++) {
		uint64_t devidx = (f + c) % dcols;
		vdev_t *cvd = vd->vdev_child[devidx];

		/*
		 * dsl_scan_need_resilver() already checked vd with
		 * vdev_dtl_contains(). So here just check cvd with
		 * vdev_dtl_empty(), cheaper and a good approximation.
		 */
		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
			return (B_TRUE);
	}

	return (B_FALSE);
}

static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
{
	vdev_t *raidvd = cvd->vdev_parent;
	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);

	uint64_t width = raidvd->vdev_children;
	uint64_t tgt_col = cvd->vdev_id;
	uint64_t ashift = raidvd->vdev_top->vdev_ashift;

	/* make sure the offsets are block-aligned */
	ASSERT0(in->rs_start % (1 << ashift));
	ASSERT0(in->rs_end % (1 << ashift));
	uint64_t b_start = in->rs_start >> ashift;
	uint64_t b_end = in->rs_end >> ashift;

	uint64_t start_row = 0;
	if (b_start > tgt_col) /* avoid underflow */
		start_row = ((b_start - tgt_col - 1) / width) + 1;

	uint64_t end_row = 0;
	if (b_end > tgt_col)
		end_row = ((b_end - tgt_col - 1) / width) + 1;

	res->rs_start = start_row << ashift;
	res->rs_end = end_row << ashift;

	ASSERT3U(res->rs_start, <=, in->rs_start);
	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
}

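/*
 * An illustrative translation, using assumed values: for a raidz vdev with
 * width = 4 and ashift = 9, translating the parent range [0, 4096) onto the
 * child in column tgt_col = 1 gives b_start = 0, b_end = 8, start_row = 0
 * and end_row = 2, so res covers [0, 1024) on the child. That matches the
 * two parent sectors (1 and 5) in the input range that map to this column.
 */
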
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	vdev_raidz_need_resilver,
	NULL,			/* hold */
	NULL,			/* rele */
	NULL,			/* remap */
	vdev_raidz_xlate,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};