Illumos #3006
[zfs.git] / module / zfs / vdev_raidz.c
index 69e3144..3e1878d 100644 (file)
@@ -20,8 +20,8 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 /*
  * Virtual device vector for RAID-Z.
  *
- * This vdev supports both single and double parity. For single parity, we
- * use a simple XOR of all the data columns. For double parity, we use both
- * the simple XOR as well as a technique described in "The mathematics of
- * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
- * over the integers expressable in a single byte. Briefly, the operations on
- * the field are defined as follows:
+ * This vdev supports single, double, and triple parity. For single parity,
+ * we use a simple XOR of all the data columns. For double or triple parity,
+ * we use a special case of Reed-Solomon coding. This extends the
+ * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
+ * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
+ * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
+ * former is also based. The latter is designed to provide higher performance
+ * for writes.
+ *
+ * Note that the Plank paper claimed to support arbitrary N+M, but was then
+ * amended six years later identifying a critical flaw that invalidates its
+ * claims. Nevertheless, the technique can be adapted to work for up to
+ * triple parity. For additional parity, the amendment "Note: Correction to
+ * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
+ * is viable, but the additional complexity means that write performance will
+ * suffer.
+ *
+ * All of the methods above operate on a Galois field, defined over the
+ * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
+ * can be expressed with a single byte. Briefly, the operations on the
+ * field are defined as follows:
  *
  *   o addition (+) is represented by a bitwise XOR
  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  *     (A * 2)_0 = A_7
  *
  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ * As an aside, this multiplication is derived from the error correcting
+ * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  *
  * Observe that any number in the field (except for 0) can be expressed as a
  * power of 2 -- a generator for the field. We store a table of the powers of
  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
- * than field addition). The inverse of a field element A (A^-1) is A^254.
+ * than field addition). The inverse of a field element A (A^-1) is therefore
+ * A ^ (255 - 1) = A^254.
  *
- * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
- * can be expressed by field operations:
+ * The up-to-three parity columns, P, Q, R over several data columns,
+ * D_0, ... D_n-1, can be expressed by field operations:
  *
  *     P = D_0 + D_1 + ... + D_n-2 + D_n-1
  *     Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  *       = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
+ *     R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
+ *       = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  *
- * See the reconstruction code below for how P and Q can used individually or
- * in concert to recover missing data columns.
+ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
+ * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
+ * independent coefficients. (There are no additional coefficients that have
+ * this property which is why the uncorrected Plank method breaks down.)
+ *
+ * See the reconstruction code below for how P, Q and R can used individually
+ * or in concert to recover missing data columns.
  */
 
 typedef struct raidz_col {
@@ -78,27 +103,60 @@ typedef struct raidz_col {
        uint64_t rc_offset;             /* device offset */
        uint64_t rc_size;               /* I/O size */
        void *rc_data;                  /* I/O data */
+       void *rc_gdata;                 /* used to store the "good" version */
        int rc_error;                   /* I/O error for this device */
        uint8_t rc_tried;               /* Did we attempt this I/O column? */
        uint8_t rc_skipped;             /* Did we skip this I/O column? */
 } raidz_col_t;
 
 typedef struct raidz_map {
-       uint64_t rm_cols;               /* Column count */
+       uint64_t rm_cols;               /* Regular column count */
+       uint64_t rm_scols;              /* Count including skipped columns */
        uint64_t rm_bigcols;            /* Number of oversized columns */
        uint64_t rm_asize;              /* Actual total I/O size */
        uint64_t rm_missingdata;        /* Count of missing data devices */
        uint64_t rm_missingparity;      /* Count of missing parity devices */
        uint64_t rm_firstdatacol;       /* First data column/parity count */
+       uint64_t rm_nskip;              /* Skipped sectors for padding */
+       uint64_t rm_skipstart;  /* Column index of padding start */
+       void *rm_datacopy;              /* rm_asize-buffer of copied data */
+       uintptr_t rm_reports;           /* # of referencing checksum reports */
+       uint8_t rm_freed;               /* map no longer has referencing ZIO */
+       uint8_t rm_ecksuminjected;      /* checksum error was injected */
        raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
 } raidz_map_t;
 
 #define        VDEV_RAIDZ_P            0
 #define        VDEV_RAIDZ_Q            1
+#define        VDEV_RAIDZ_R            2
+
+#define        VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
+#define        VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
+
+/*
+ * We provide a mechanism to perform the field multiplication operation on a
+ * 64-bit value all at once rather than a byte at a time. This works by
+ * creating a mask from the top bit in each byte and using that to
+ * conditionally apply the XOR of 0x1d.
+ */
+#define        VDEV_RAIDZ_64MUL_2(x, mask) \
+{ \
+       (mask) = (x) & 0x8080808080808080ULL; \
+       (mask) = ((mask) << 1) - ((mask) >> 7); \
+       (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
+           ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
+}
 
-#define        VDEV_RAIDZ_MAXPARITY    2
+#define        VDEV_RAIDZ_64MUL_4(x, mask) \
+{ \
+       VDEV_RAIDZ_64MUL_2((x), mask); \
+       VDEV_RAIDZ_64MUL_2((x), mask); \
+}
 
-#define        VDEV_RAIDZ_MUL_2(a)     (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
+/*
+ * Force reconstruction to use the general purpose method.
+ */
+int vdev_raidz_default_to_general;
 
 /*
  * These two tables represent powers and logs of 2 in the Galois field defined
@@ -173,6 +231,8 @@ static const uint8_t vdev_raidz_log2[256] = {
        0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 };
 
+static void vdev_raidz_generate_parity(raidz_map_t *rm);
+
 /*
  * Multiply a given number by 2 raised to the given power.
  */
@@ -193,17 +253,184 @@ vdev_raidz_exp2(uint_t a, int exp)
 }
 
 static void
-vdev_raidz_map_free(zio_t *zio)
+vdev_raidz_map_free(raidz_map_t *rm)
 {
-       raidz_map_t *rm = zio->io_vsd;
        int c;
+       size_t size;
 
-       for (c = 0; c < rm->rm_firstdatacol; c++)
+       for (c = 0; c < rm->rm_firstdatacol; c++) {
                zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
 
-       kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
+               if (rm->rm_col[c].rc_gdata != NULL)
+                       zio_buf_free(rm->rm_col[c].rc_gdata,
+                           rm->rm_col[c].rc_size);
+       }
+
+       size = 0;
+       for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+               size += rm->rm_col[c].rc_size;
+
+       if (rm->rm_datacopy != NULL)
+               zio_buf_free(rm->rm_datacopy, size);
+
+       kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
+}
+
+static void
+vdev_raidz_map_free_vsd(zio_t *zio)
+{
+       raidz_map_t *rm = zio->io_vsd;
+
+       ASSERT0(rm->rm_freed);
+       rm->rm_freed = 1;
+
+       if (rm->rm_reports == 0)
+               vdev_raidz_map_free(rm);
+}
+
+/*ARGSUSED*/
+static void
+vdev_raidz_cksum_free(void *arg, size_t ignored)
+{
+       raidz_map_t *rm = arg;
+
+       ASSERT3U(rm->rm_reports, >, 0);
+
+       if (--rm->rm_reports == 0 && rm->rm_freed != 0)
+               vdev_raidz_map_free(rm);
+}
+
+static void
+vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
+{
+       raidz_map_t *rm = zcr->zcr_cbdata;
+       size_t c = zcr->zcr_cbinfo;
+       size_t x;
+
+       const char *good = NULL;
+       const char *bad = rm->rm_col[c].rc_data;
+
+       if (good_data == NULL) {
+               zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+               return;
+       }
+
+       if (c < rm->rm_firstdatacol) {
+               /*
+                * The first time through, calculate the parity blocks for
+                * the good data (this relies on the fact that the good
+                * data never changes for a given logical ZIO)
+                */
+               if (rm->rm_col[0].rc_gdata == NULL) {
+                       char *bad_parity[VDEV_RAIDZ_MAXPARITY];
+                       char *buf;
+
+                       /*
+                        * Set up the rm_col[]s to generate the parity for
+                        * good_data, first saving the parity bufs and
+                        * replacing them with buffers to hold the result.
+                        */
+                       for (x = 0; x < rm->rm_firstdatacol; x++) {
+                               bad_parity[x] = rm->rm_col[x].rc_data;
+                               rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
+                                   zio_buf_alloc(rm->rm_col[x].rc_size);
+                       }
+
+                       /* fill in the data columns from good_data */
+                       buf = (char *)good_data;
+                       for (; x < rm->rm_cols; x++) {
+                               rm->rm_col[x].rc_data = buf;
+                               buf += rm->rm_col[x].rc_size;
+                       }
+
+                       /*
+                        * Construct the parity from the good data.
+                        */
+                       vdev_raidz_generate_parity(rm);
+
+                       /* restore everything back to its original state */
+                       for (x = 0; x < rm->rm_firstdatacol; x++)
+                               rm->rm_col[x].rc_data = bad_parity[x];
+
+                       buf = rm->rm_datacopy;
+                       for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
+                               rm->rm_col[x].rc_data = buf;
+                               buf += rm->rm_col[x].rc_size;
+                       }
+               }
+
+               ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
+               good = rm->rm_col[c].rc_gdata;
+       } else {
+               /* adjust good_data to point at the start of our column */
+               good = good_data;
+
+               for (x = rm->rm_firstdatacol; x < c; x++)
+                       good += rm->rm_col[x].rc_size;
+       }
+
+       /* we drop the ereport if it ends up that the data was good */
+       zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+}
+
+/*
+ * Invoked indirectly by zfs_ereport_start_checksum(), called
+ * below when our read operation fails completely.  The main point
+ * is to keep a copy of everything we read from disk, so that at
+ * vdev_raidz_cksum_finish() time we can compare it with the good data.
+ */
+static void
+vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
+{
+       size_t c = (size_t)(uintptr_t)arg;
+       caddr_t buf;
+
+       raidz_map_t *rm = zio->io_vsd;
+       size_t size;
+
+       /* set up the report and bump the refcount  */
+       zcr->zcr_cbdata = rm;
+       zcr->zcr_cbinfo = c;
+       zcr->zcr_finish = vdev_raidz_cksum_finish;
+       zcr->zcr_free = vdev_raidz_cksum_free;
+
+       rm->rm_reports++;
+       ASSERT3U(rm->rm_reports, >, 0);
+
+       if (rm->rm_datacopy != NULL)
+               return;
+
+       /*
+        * It's the first time we're called for this raidz_map_t, so we need
+        * to copy the data aside; there's no guarantee that our zio's buffer
+        * won't be re-used for something else.
+        *
+        * Our parity data is already in separate buffers, so there's no need
+        * to copy them.
+        */
+
+       size = 0;
+       for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+               size += rm->rm_col[c].rc_size;
+
+       buf = rm->rm_datacopy = zio_buf_alloc(size);
+
+       for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+               raidz_col_t *col = &rm->rm_col[c];
+
+               bcopy(col->rc_data, buf, col->rc_size);
+               col->rc_data = buf;
+
+               buf += col->rc_size;
+       }
+       ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
 }
 
+static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
+       vdev_raidz_map_free_vsd,
+       vdev_raidz_cksum_report
+};
+
 static raidz_map_t *
 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
     uint64_t nparity)
@@ -213,24 +440,40 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
        uint64_t s = zio->io_size >> unit_shift;
        uint64_t f = b % dcols;
        uint64_t o = (b / dcols) << unit_shift;
-       uint64_t q, r, c, bc, col, acols, coff, devidx;
+       uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 
        q = s / (dcols - nparity);
        r = s - q * (dcols - nparity);
        bc = (r == 0 ? 0 : r + nparity);
+       tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+       if (q == 0) {
+               acols = bc;
+               scols = MIN(dcols, roundup(bc, nparity + 1));
+       } else {
+               acols = dcols;
+               scols = dcols;
+       }
 
-       acols = (q == 0 ? bc : dcols);
+       ASSERT3U(acols, <=, scols);
 
-       rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
+       rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_PUSHPAGE);
 
        rm->rm_cols = acols;
+       rm->rm_scols = scols;
        rm->rm_bigcols = bc;
-       rm->rm_asize = 0;
+       rm->rm_skipstart = bc;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;
        rm->rm_firstdatacol = nparity;
+       rm->rm_datacopy = NULL;
+       rm->rm_reports = 0;
+       rm->rm_freed = 0;
+       rm->rm_ecksuminjected = 0;
 
-       for (c = 0; c < acols; c++) {
+       asize = 0;
+
+       for (c = 0; c < scols; c++) {
                col = f + c;
                coff = o;
                if (col >= dcols) {
@@ -239,15 +482,27 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
                }
                rm->rm_col[c].rc_devidx = col;
                rm->rm_col[c].rc_offset = coff;
-               rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
                rm->rm_col[c].rc_data = NULL;
+               rm->rm_col[c].rc_gdata = NULL;
                rm->rm_col[c].rc_error = 0;
                rm->rm_col[c].rc_tried = 0;
                rm->rm_col[c].rc_skipped = 0;
-               rm->rm_asize += rm->rm_col[c].rc_size;
+
+               if (c >= acols)
+                       rm->rm_col[c].rc_size = 0;
+               else if (c < bc)
+                       rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+               else
+                       rm->rm_col[c].rc_size = q << unit_shift;
+
+               asize += rm->rm_col[c].rc_size;
        }
 
-       rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
+       ASSERT3U(asize, ==, tot << unit_shift);
+       rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
+       rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+       ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
+       ASSERT3U(rm->rm_nskip, <=, nparity);
 
        for (c = 0; c < rm->rm_firstdatacol; c++)
                rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
@@ -272,6 +527,11 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
         * Unfortunately, this decision created an implicit on-disk format
         * requirement that we need to support for all eternity, but only
         * for single-parity RAID-Z.
+        *
+        * If we intend to skip a sector in the zeroth column for padding
+        * we must make sure to note this swap. We will never intend to
+        * skip the first column since at least one data and one parity
+        * column must appear in each row.
         */
        ASSERT(rm->rm_cols >= 2);
        ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
@@ -283,10 +543,13 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
                rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
                rm->rm_col[1].rc_devidx = devidx;
                rm->rm_col[1].rc_offset = o;
+
+               if (rm->rm_skipstart == 0)
+                       rm->rm_skipstart = 1;
        }
 
        zio->io_vsd = rm;
-       zio->io_vsd_free = vdev_raidz_map_free;
+       zio->io_vsd_ops = &vdev_raidz_vsd_ops;
        return (rm);
 }
 
@@ -305,12 +568,12 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
 
                if (c == rm->rm_firstdatacol) {
                        ASSERT(ccount == pcount);
-                       for (i = 0; i < ccount; i++, p++, src++) {
+                       for (i = 0; i < ccount; i++, src++, p++) {
                                *p = *src;
                        }
                } else {
                        ASSERT(ccount <= pcount);
-                       for (i = 0; i < ccount; i++, p++, src++) {
+                       for (i = 0; i < ccount; i++, src++, p++) {
                                *p ^= *src;
                        }
                }
@@ -320,10 +583,10 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
 static void
 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 {
-       uint64_t *q, *p, *src, pcount, ccount, mask, i;
+       uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
        int c;
 
-       pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+       pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
        ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
            rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 
@@ -331,55 +594,138 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm)
                src = rm->rm_col[c].rc_data;
                p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
                q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
-               ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+               ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 
                if (c == rm->rm_firstdatacol) {
-                       ASSERT(ccount == pcount || ccount == 0);
-                       for (i = 0; i < ccount; i++, p++, q++, src++) {
-                               *q = *src;
+                       ASSERT(ccnt == pcnt || ccnt == 0);
+                       for (i = 0; i < ccnt; i++, src++, p++, q++) {
                                *p = *src;
+                               *q = *src;
                        }
-                       for (; i < pcount; i++, p++, q++, src++) {
-                               *q = 0;
+                       for (; i < pcnt; i++, src++, p++, q++) {
                                *p = 0;
+                               *q = 0;
                        }
                } else {
-                       ASSERT(ccount <= pcount);
+                       ASSERT(ccnt <= pcnt);
 
                        /*
-                        * Rather than multiplying each byte individually (as
-                        * described above), we are able to handle 8 at once
-                        * by generating a mask based on the high bit in each
-                        * byte and using that to conditionally XOR in 0x1d.
+                        * Apply the algorithm described above by multiplying
+                        * the previous result and adding in the new value.
                         */
-                       for (i = 0; i < ccount; i++, p++, q++, src++) {
-                               mask = *q & 0x8080808080808080ULL;
-                               mask = (mask << 1) - (mask >> 7);
-                               *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
-                                   (mask & 0x1d1d1d1d1d1d1d1dULL);
+                       for (i = 0; i < ccnt; i++, src++, p++, q++) {
+                               *p ^= *src;
+
+                               VDEV_RAIDZ_64MUL_2(*q, mask);
                                *q ^= *src;
+                       }
+
+                       /*
+                        * Treat short columns as though they are full of 0s.
+                        * Note that there's therefore nothing needed for P.
+                        */
+                       for (; i < pcnt; i++, q++) {
+                               VDEV_RAIDZ_64MUL_2(*q, mask);
+                       }
+               }
+       }
+}
+
+static void
+vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
+{
+       uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
+       int c;
+
+       pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+       ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+           rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+       ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+           rm->rm_col[VDEV_RAIDZ_R].rc_size);
+
+       for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+               src = rm->rm_col[c].rc_data;
+               p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+               q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+               r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
+
+               ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+               if (c == rm->rm_firstdatacol) {
+                       ASSERT(ccnt == pcnt || ccnt == 0);
+                       for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
+                               *p = *src;
+                               *q = *src;
+                               *r = *src;
+                       }
+                       for (; i < pcnt; i++, src++, p++, q++, r++) {
+                               *p = 0;
+                               *q = 0;
+                               *r = 0;
+                       }
+               } else {
+                       ASSERT(ccnt <= pcnt);
+
+                       /*
+                        * Apply the algorithm described above by multiplying
+                        * the previous result and adding in the new value.
+                        */
+                       for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
                                *p ^= *src;
+
+                               VDEV_RAIDZ_64MUL_2(*q, mask);
+                               *q ^= *src;
+
+                               VDEV_RAIDZ_64MUL_4(*r, mask);
+                               *r ^= *src;
                        }
 
                        /*
                         * Treat short columns as though they are full of 0s.
+                        * Note that there's therefore nothing needed for P.
                         */
-                       for (; i < pcount; i++, q++) {
-                               mask = *q & 0x8080808080808080ULL;
-                               mask = (mask << 1) - (mask >> 7);
-                               *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
-                                   (mask & 0x1d1d1d1d1d1d1d1dULL);
+                       for (; i < pcnt; i++, q++, r++) {
+                               VDEV_RAIDZ_64MUL_2(*q, mask);
+                               VDEV_RAIDZ_64MUL_4(*r, mask);
                        }
                }
        }
 }
 
+/*
+ * Generate RAID parity in the first virtual columns according to the number of
+ * parity columns available.
+ */
 static void
-vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
+vdev_raidz_generate_parity(raidz_map_t *rm)
+{
+       switch (rm->rm_firstdatacol) {
+       case 1:
+               vdev_raidz_generate_parity_p(rm);
+               break;
+       case 2:
+               vdev_raidz_generate_parity_pq(rm);
+               break;
+       case 3:
+               vdev_raidz_generate_parity_pqr(rm);
+               break;
+       default:
+               cmn_err(CE_PANIC, "invalid RAID-Z configuration");
+       }
+}
+
+static int
+vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 {
        uint64_t *dst, *src, xcount, ccount, count, i;
+       int x = tgts[0];
        int c;
 
+       ASSERT(ntgts == 1);
+       ASSERT(x >= rm->rm_firstdatacol);
+       ASSERT(x < rm->rm_cols);
+
        xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
        ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
        ASSERT(xcount > 0);
@@ -404,15 +750,20 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
                        *dst ^= *src;
                }
        }
+
+       return (1 << VDEV_RAIDZ_P);
 }
 
-static void
-vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
+static int
+vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
 {
        uint64_t *dst, *src, xcount, ccount, count, mask, i;
        uint8_t *b;
+       int x = tgts[0];
        int c, j, exp;
 
+       ASSERT(ntgts == 1);
+
        xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
        ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
 
@@ -436,23 +787,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
                        }
 
                } else {
-                       /*
-                        * For an explanation of this, see the comment in
-                        * vdev_raidz_generate_parity_pq() above.
-                        */
                        for (i = 0; i < count; i++, dst++, src++) {
-                               mask = *dst & 0x8080808080808080ULL;
-                               mask = (mask << 1) - (mask >> 7);
-                               *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
-                                   (mask & 0x1d1d1d1d1d1d1d1dULL);
+                               VDEV_RAIDZ_64MUL_2(*dst, mask);
                                *dst ^= *src;
                        }
 
                        for (; i < xcount; i++, dst++) {
-                               mask = *dst & 0x8080808080808080ULL;
-                               mask = (mask << 1) - (mask >> 7);
-                               *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
-                                   (mask & 0x1d1d1d1d1d1d1d1dULL);
+                               VDEV_RAIDZ_64MUL_2(*dst, mask);
                        }
                }
        }
@@ -467,15 +808,20 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
                        *b = vdev_raidz_exp2(*b, exp);
                }
        }
+
+       return (1 << VDEV_RAIDZ_Q);
 }
 
-static void
-vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
+static int
+vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 {
        uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
        void *pdata, *qdata;
        uint64_t xsize, ysize, i;
+       int x = tgts[0];
+       int y = tgts[1];
 
+       ASSERT(ntgts == 2);
        ASSERT(x < y);
        ASSERT(x >= rm->rm_firstdatacol);
        ASSERT(y < rm->rm_cols);
@@ -553,93 +899,636 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
         */
        rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
        rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+
+       return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
 }
 
+/* BEGIN CSTYLED */
+/*
+ * In the general case of reconstruction, we must solve the system of linear
+ * equations defined by the coeffecients used to generate parity as well as
+ * the contents of the data and parity disks. This can be expressed with
+ * vectors for the original data (D) and the actual data (d) and parity (p)
+ * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
+ *
+ *            __   __                     __     __
+ *            |     |         __     __   |  p_0  |
+ *            |  V  |         |  D_0  |   | p_m-1 |
+ *            |     |    x    |   :   | = |  d_0  |
+ *            |  I  |         | D_n-1 |   |   :   |
+ *            |     |         ~~     ~~   | d_n-1 |
+ *            ~~   ~~                     ~~     ~~
+ *
+ * I is simply a square identity matrix of size n, and V is a vandermonde
+ * matrix defined by the coeffecients we chose for the various parity columns
+ * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
+ * computation as well as linear separability.
+ *
+ *      __               __               __     __
+ *      |   1   ..  1 1 1 |               |  p_0  |
+ *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
+ *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
+ *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
+ *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
+ *      |   :       : : : |   |   :   |   |  d_2  |
+ *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
+ *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
+ *      |   0   ..  0 0 1 |               | d_n-1 |
+ *      ~~               ~~               ~~     ~~
+ *
+ * Note that I, V, d, and p are known. To compute D, we must invert the
+ * matrix and use the known data and parity values to reconstruct the unknown
+ * data values. We begin by removing the rows in V|I and d|p that correspond
+ * to failed or missing columns; we then make V|I square (n x n) and d|p
+ * sized n by removing rows corresponding to unused parity from the bottom up
+ * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
+ * using Gauss-Jordan elimination. In the example below we use m=3 parity
+ * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
+ *           __                               __
+ *           |  1   1   1   1   1   1   1   1  |
+ *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
+ *           |  19 205 116  29  64  16  4   1  |      / /
+ *           |  1   0   0   0   0   0   0   0  |     / /
+ *           |  0   1   0   0   0   0   0   0  | <--' /
+ *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
+ *           |  0   0   0   1   0   0   0   0  |
+ *           |  0   0   0   0   1   0   0   0  |
+ *           |  0   0   0   0   0   1   0   0  |
+ *           |  0   0   0   0   0   0   1   0  |
+ *           |  0   0   0   0   0   0   0   1  |
+ *           ~~                               ~~
+ *           __                               __
+ *           |  1   1   1   1   1   1   1   1  |
+ *           | 128  64  32  16  8   4   2   1  |
+ *           |  19 205 116  29  64  16  4   1  |
+ *           |  1   0   0   0   0   0   0   0  |
+ *           |  0   1   0   0   0   0   0   0  |
+ *  (V|I)' = |  0   0   1   0   0   0   0   0  |
+ *           |  0   0   0   1   0   0   0   0  |
+ *           |  0   0   0   0   1   0   0   0  |
+ *           |  0   0   0   0   0   1   0   0  |
+ *           |  0   0   0   0   0   0   1   0  |
+ *           |  0   0   0   0   0   0   0   1  |
+ *           ~~                               ~~
+ *
+ * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
+ * have carefully chosen the seed values 1, 2, and 4 to ensure that this
+ * matrix is not singular.
+ * __                                                                 __
+ * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
+ * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
+ * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
+ * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
+ * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
+ * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
+ * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ *                   __                               __
+ *                   |  0   0   1   0   0   0   0   0  |
+ *                   | 167 100  5   41 159 169 217 208 |
+ *                   | 166 100  4   40 158 168 216 209 |
+ *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
+ *                   |  0   0   0   0   1   0   0   0  |
+ *                   |  0   0   0   0   0   1   0   0  |
+ *                   |  0   0   0   0   0   0   1   0  |
+ *                   |  0   0   0   0   0   0   0   1  |
+ *                   ~~                               ~~
+ *
+ * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
+ * of the missing data.
+ *
+ * As is apparent from the example above, the only non-trivial rows in the
+ * inverse matrix correspond to the data disks that we're trying to
+ * reconstruct. Indeed, those are the only rows we need as the others would
+ * only be useful for reconstructing data known or assumed to be valid. For
+ * that reason, we only build the coefficients in the rows that correspond to
+ * targeted columns.
+ */
+/* END CSTYLED */
 
-static int
-vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+static void
+vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
+    uint8_t **rows)
 {
-       vdev_t *cvd;
-       uint64_t nparity = vd->vdev_nparity;
-       int c, error;
-       int lasterror = 0;
-       int numerrors = 0;
+       int i, j;
+       int pow;
 
-       ASSERT(nparity > 0);
+       ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
 
-       if (nparity > VDEV_RAIDZ_MAXPARITY ||
-           vd->vdev_children < nparity + 1) {
-               vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
-               return (EINVAL);
+       /*
+        * Fill in the missing rows of interest.
+        */
+       for (i = 0; i < nmap; i++) {
+               ASSERT3S(0, <=, map[i]);
+               ASSERT3S(map[i], <=, 2);
+
+               pow = map[i] * n;
+               if (pow > 255)
+                       pow -= 255;
+               ASSERT(pow <= 255);
+
+               for (j = 0; j < n; j++) {
+                       pow -= map[i];
+                       if (pow < 0)
+                               pow += 255;
+                       rows[i][j] = vdev_raidz_pow2[pow];
+               }
        }
+}
 
-       for (c = 0; c < vd->vdev_children; c++) {
-               cvd = vd->vdev_child[c];
+static void
+vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
+    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
+{
+       int i, j, ii, jj;
+       uint8_t log;
 
-               if ((error = vdev_open(cvd)) != 0) {
-                       lasterror = error;
-                       numerrors++;
-                       continue;
+       /*
+        * Assert that the first nmissing entries from the array of used
+        * columns correspond to parity columns and that subsequent entries
+        * correspond to data columns.
+        */
+       for (i = 0; i < nmissing; i++) {
+               ASSERT3S(used[i], <, rm->rm_firstdatacol);
+       }
+       for (; i < n; i++) {
+               ASSERT3S(used[i], >=, rm->rm_firstdatacol);
+       }
+
+       /*
+        * First initialize the storage where we'll compute the inverse rows.
+        */
+       for (i = 0; i < nmissing; i++) {
+               for (j = 0; j < n; j++) {
+                       invrows[i][j] = (i == j) ? 1 : 0;
                }
+       }
 
-               *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
-               *ashift = MAX(*ashift, cvd->vdev_ashift);
+       /*
+        * Subtract all trivial rows from the rows of consequence.
+        */
+       for (i = 0; i < nmissing; i++) {
+               for (j = nmissing; j < n; j++) {
+                       ASSERT3U(used[j], >=, rm->rm_firstdatacol);
+                       jj = used[j] - rm->rm_firstdatacol;
+                       ASSERT3S(jj, <, n);
+                       invrows[i][j] = rows[i][jj];
+                       rows[i][jj] = 0;
+               }
        }
 
-       *asize *= vd->vdev_children;
+       /*
+        * For each of the rows of interest, we must normalize it and subtract
+        * a multiple of it from the other rows.
+        */
+       for (i = 0; i < nmissing; i++) {
+               for (j = 0; j < missing[i]; j++) {
+                       ASSERT0(rows[i][j]);
+               }
+               ASSERT3U(rows[i][missing[i]], !=, 0);
 
-       if (numerrors > nparity) {
-               vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
-               return (lasterror);
-       }
+               /*
+                * Compute the inverse of the first element and multiply each
+                * element in the row by that value.
+                */
+               log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
 
-       return (0);
-}
+               for (j = 0; j < n; j++) {
+                       rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
+                       invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
+               }
 
-static void
-vdev_raidz_close(vdev_t *vd)
-{
-       int c;
+               for (ii = 0; ii < nmissing; ii++) {
+                       if (i == ii)
+                               continue;
 
-       for (c = 0; c < vd->vdev_children; c++)
-               vdev_close(vd->vdev_child[c]);
-}
+                       ASSERT3U(rows[ii][missing[i]], !=, 0);
 
-static uint64_t
-vdev_raidz_asize(vdev_t *vd, uint64_t psize)
-{
-       uint64_t asize;
-       uint64_t ashift = vd->vdev_top->vdev_ashift;
-       uint64_t cols = vd->vdev_children;
-       uint64_t nparity = vd->vdev_nparity;
+                       log = vdev_raidz_log2[rows[ii][missing[i]]];
 
-       asize = ((psize - 1) >> ashift) + 1;
-       asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
-       asize = roundup(asize, nparity + 1) << ashift;
+                       for (j = 0; j < n; j++) {
+                               rows[ii][j] ^=
+                                   vdev_raidz_exp2(rows[i][j], log);
+                               invrows[ii][j] ^=
+                                   vdev_raidz_exp2(invrows[i][j], log);
+                       }
+               }
+       }
 
-       return (asize);
+       /*
+        * Verify that the data that is left in the rows are properly part of
+        * an identity matrix.
+        */
+       for (i = 0; i < nmissing; i++) {
+               for (j = 0; j < n; j++) {
+                       if (j == missing[i]) {
+                               ASSERT3U(rows[i][j], ==, 1);
+                       } else {
+                               ASSERT0(rows[i][j]);
+                       }
+               }
+       }
 }
 
 static void
-vdev_raidz_child_done(zio_t *zio)
+vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
+    int *missing, uint8_t **invrows, const uint8_t *used)
 {
-       raidz_col_t *rc = zio->io_private;
+       int i, j, x, cc, c;
+       uint8_t *src;
+       uint64_t ccount;
+       uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
+       uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
+       uint8_t log = 0, val;
+       int ll;
+       uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
+       uint8_t *p, *pp;
+       size_t psize;
+
+       psize = sizeof (invlog[0][0]) * n * nmissing;
+       p = kmem_alloc(psize, KM_PUSHPAGE);
+
+       for (pp = p, i = 0; i < nmissing; i++) {
+               invlog[i] = pp;
+               pp += n;
+       }
 
-       rc->rc_error = zio->io_error;
-       rc->rc_tried = 1;
-       rc->rc_skipped = 0;
-}
+       for (i = 0; i < nmissing; i++) {
+               for (j = 0; j < n; j++) {
+                       ASSERT3U(invrows[i][j], !=, 0);
+                       invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
+               }
+       }
 
-static int
-vdev_raidz_io_start(zio_t *zio)
+       for (i = 0; i < n; i++) {
+               c = used[i];
+               ASSERT3U(c, <, rm->rm_cols);
+
+               src = rm->rm_col[c].rc_data;
+               ccount = rm->rm_col[c].rc_size;
+               for (j = 0; j < nmissing; j++) {
+                       cc = missing[j] + rm->rm_firstdatacol;
+                       ASSERT3U(cc, >=, rm->rm_firstdatacol);
+                       ASSERT3U(cc, <, rm->rm_cols);
+                       ASSERT3U(cc, !=, c);
+
+                       dst[j] = rm->rm_col[cc].rc_data;
+                       dcount[j] = rm->rm_col[cc].rc_size;
+               }
+
+               ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
+
+               for (x = 0; x < ccount; x++, src++) {
+                       if (*src != 0)
+                               log = vdev_raidz_log2[*src];
+
+                       for (cc = 0; cc < nmissing; cc++) {
+                               if (x >= dcount[cc])
+                                       continue;
+
+                               if (*src == 0) {
+                                       val = 0;
+                               } else {
+                                       if ((ll = log + invlog[cc][i]) >= 255)
+                                               ll -= 255;
+                                       val = vdev_raidz_pow2[ll];
+                               }
+
+                               if (i == 0)
+                                       dst[cc][x] = val;
+                               else
+                                       dst[cc][x] ^= val;
+                       }
+               }
+       }
+
+       kmem_free(p, psize);
+}
+
+static int
+vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
+{
+       int n, i, c, t, tt;
+       int nmissing_rows;
+       int missing_rows[VDEV_RAIDZ_MAXPARITY];
+       int parity_map[VDEV_RAIDZ_MAXPARITY];
+
+       uint8_t *p, *pp;
+       size_t psize;
+
+       uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
+       uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
+       uint8_t *used;
+
+       int code = 0;
+
+
+       n = rm->rm_cols - rm->rm_firstdatacol;
+
+       /*
+        * Figure out which data columns are missing.
+        */
+       nmissing_rows = 0;
+       for (t = 0; t < ntgts; t++) {
+               if (tgts[t] >= rm->rm_firstdatacol) {
+                       missing_rows[nmissing_rows++] =
+                           tgts[t] - rm->rm_firstdatacol;
+               }
+       }
+
+       /*
+        * Figure out which parity columns to use to help generate the missing
+        * data columns.
+        */
+       for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
+               ASSERT(tt < ntgts);
+               ASSERT(c < rm->rm_firstdatacol);
+
+               /*
+                * Skip any targeted parity columns.
+                */
+               if (c == tgts[tt]) {
+                       tt++;
+                       continue;
+               }
+
+               code |= 1 << c;
+
+               parity_map[i] = c;
+               i++;
+       }
+
+       ASSERT(code != 0);
+       ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
+
+       psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
+           nmissing_rows * n + sizeof (used[0]) * n;
+       p = kmem_alloc(psize, KM_PUSHPAGE);
+
+       for (pp = p, i = 0; i < nmissing_rows; i++) {
+               rows[i] = pp;
+               pp += n;
+               invrows[i] = pp;
+               pp += n;
+       }
+       used = pp;
+
+       for (i = 0; i < nmissing_rows; i++) {
+               used[i] = parity_map[i];
+       }
+
+       for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+               if (tt < nmissing_rows &&
+                   c == missing_rows[tt] + rm->rm_firstdatacol) {
+                       tt++;
+                       continue;
+               }
+
+               ASSERT3S(i, <, n);
+               used[i] = c;
+               i++;
+       }
+
+       /*
+        * Initialize the interesting rows of the matrix.
+        */
+       vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
+
+       /*
+        * Invert the matrix.
+        */
+       vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
+           invrows, used);
+
+       /*
+        * Reconstruct the missing data using the generated matrix.
+        */
+       vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
+           invrows, used);
+
+       kmem_free(p, psize);
+
+       return (code);
+}
+
+static int
+vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
+{
+       int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
+       int ntgts;
+       int i, c;
+       int code;
+       int nbadparity, nbaddata;
+       int parity_valid[VDEV_RAIDZ_MAXPARITY];
+
+       /*
+        * The tgts list must already be sorted.
+        */
+       for (i = 1; i < nt; i++) {
+               ASSERT(t[i] > t[i - 1]);
+       }
+
+       nbadparity = rm->rm_firstdatacol;
+       nbaddata = rm->rm_cols - nbadparity;
+       ntgts = 0;
+       for (i = 0, c = 0; c < rm->rm_cols; c++) {
+               if (c < rm->rm_firstdatacol)
+                       parity_valid[c] = B_FALSE;
+
+               if (i < nt && c == t[i]) {
+                       tgts[ntgts++] = c;
+                       i++;
+               } else if (rm->rm_col[c].rc_error != 0) {
+                       tgts[ntgts++] = c;
+               } else if (c >= rm->rm_firstdatacol) {
+                       nbaddata--;
+               } else {
+                       parity_valid[c] = B_TRUE;
+                       nbadparity--;
+               }
+       }
+
+       ASSERT(ntgts >= nt);
+       ASSERT(nbaddata >= 0);
+       ASSERT(nbaddata + nbadparity == ntgts);
+
+       dt = &tgts[nbadparity];
+
+       /*
+        * See if we can use any of our optimized reconstruction routines.
+        */
+       if (!vdev_raidz_default_to_general) {
+               switch (nbaddata) {
+               case 1:
+                       if (parity_valid[VDEV_RAIDZ_P])
+                               return (vdev_raidz_reconstruct_p(rm, dt, 1));
+
+                       ASSERT(rm->rm_firstdatacol > 1);
+
+                       if (parity_valid[VDEV_RAIDZ_Q])
+                               return (vdev_raidz_reconstruct_q(rm, dt, 1));
+
+                       ASSERT(rm->rm_firstdatacol > 2);
+                       break;
+
+               case 2:
+                       ASSERT(rm->rm_firstdatacol > 1);
+
+                       if (parity_valid[VDEV_RAIDZ_P] &&
+                           parity_valid[VDEV_RAIDZ_Q])
+                               return (vdev_raidz_reconstruct_pq(rm, dt, 2));
+
+                       ASSERT(rm->rm_firstdatacol > 2);
+
+                       break;
+               }
+       }
+
+       code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
+       ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
+       ASSERT(code > 0);
+       return (code);
+}
+
+static int
+vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+    uint64_t *ashift)
+{
+       vdev_t *cvd;
+       uint64_t nparity = vd->vdev_nparity;
+       int c;
+       int lasterror = 0;
+       int numerrors = 0;
+
+       ASSERT(nparity > 0);
+
+       if (nparity > VDEV_RAIDZ_MAXPARITY ||
+           vd->vdev_children < nparity + 1) {
+               vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+               return (EINVAL);
+       }
+
+       vdev_open_children(vd);
+
+       for (c = 0; c < vd->vdev_children; c++) {
+               cvd = vd->vdev_child[c];
+
+               if (cvd->vdev_open_error != 0) {
+                       lasterror = cvd->vdev_open_error;
+                       numerrors++;
+                       continue;
+               }
+
+               *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+               *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
+               *ashift = MAX(*ashift, cvd->vdev_ashift);
+       }
+
+       *asize *= vd->vdev_children;
+       *max_asize *= vd->vdev_children;
+
+       if (numerrors > nparity) {
+               vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+               return (lasterror);
+       }
+
+       return (0);
+}
+
+static void
+vdev_raidz_close(vdev_t *vd)
+{
+       int c;
+
+       for (c = 0; c < vd->vdev_children; c++)
+               vdev_close(vd->vdev_child[c]);
+}
+
+static uint64_t
+vdev_raidz_asize(vdev_t *vd, uint64_t psize)
+{
+       uint64_t asize;
+       uint64_t ashift = vd->vdev_top->vdev_ashift;
+       uint64_t cols = vd->vdev_children;
+       uint64_t nparity = vd->vdev_nparity;
+
+       asize = ((psize - 1) >> ashift) + 1;
+       asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
+       asize = roundup(asize, nparity + 1) << ashift;
+
+       return (asize);
+}
+
+static void
+vdev_raidz_child_done(zio_t *zio)
+{
+       raidz_col_t *rc = zio->io_private;
+
+       rc->rc_error = zio->io_error;
+       rc->rc_tried = 1;
+       rc->rc_skipped = 0;
+}
+
+static int
+vdev_raidz_io_start(zio_t *zio)
 {
        vdev_t *vd = zio->io_vd;
        vdev_t *tvd = vd->vdev_top;
        vdev_t *cvd;
-       blkptr_t *bp = zio->io_bp;
        raidz_map_t *rm;
        raidz_col_t *rc;
-       int c;
+       int c, i;
 
        rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
            vd->vdev_nparity);
@@ -647,13 +1536,7 @@ vdev_raidz_io_start(zio_t *zio)
        ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
 
        if (zio->io_type == ZIO_TYPE_WRITE) {
-               /*
-                * Generate RAID parity in the first virtual columns.
-                */
-               if (rm->rm_firstdatacol == 1)
-                       vdev_raidz_generate_parity_p(rm);
-               else
-                       vdev_raidz_generate_parity_pq(rm);
+               vdev_raidz_generate_parity(rm);
 
                for (c = 0; c < rm->rm_cols; c++) {
                        rc = &rm->rm_col[c];
@@ -664,6 +1547,23 @@ vdev_raidz_io_start(zio_t *zio)
                            vdev_raidz_child_done, rc));
                }
 
+               /*
+                * Generate optional I/Os for any skipped sectors to improve
+                * aggregation contiguity.
+                */
+               for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
+                       ASSERT(c <= rm->rm_scols);
+                       if (c == rm->rm_scols)
+                               c = 0;
+                       rc = &rm->rm_col[c];
+                       cvd = vd->vdev_child[rc->rc_devidx];
+                       zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+                           rc->rc_offset + rc->rc_size, NULL,
+                           1 << tvd->vdev_ashift,
+                           zio->io_type, zio->io_priority,
+                           ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+               }
+
                return (ZIO_PIPELINE_CONTINUE);
        }
 
@@ -671,8 +1571,7 @@ vdev_raidz_io_start(zio_t *zio)
 
        /*
         * Iterate over the columns in reverse order so that we hit the parity
-        * last -- any errors along the way will force us to read the parity
-        * data.
+        * last -- any errors along the way will force us to read the parity.
         */
        for (c = rm->rm_cols - 1; c >= 0; c--) {
                rc = &rm->rm_col[c];
@@ -687,7 +1586,7 @@ vdev_raidz_io_start(zio_t *zio)
                        rc->rc_skipped = 1;
                        continue;
                }
-               if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
+               if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
                        if (c >= rm->rm_firstdatacol)
                                rm->rm_missingdata++;
                        else
@@ -697,7 +1596,7 @@ vdev_raidz_io_start(zio_t *zio)
                        continue;
                }
                if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
-                   (zio->io_flags & ZIO_FLAG_SCRUB)) {
+                   (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_data, rc->rc_size,
                            zio->io_type, zio->io_priority, 0,
@@ -708,23 +1607,50 @@ vdev_raidz_io_start(zio_t *zio)
        return (ZIO_PIPELINE_CONTINUE);
 }
 
+
 /*
  * Report a checksum error for a child of a RAID-Z device.
  */
 static void
-raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
+raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
 {
        vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
 
        if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+               zio_bad_cksum_t zbc;
+               raidz_map_t *rm = zio->io_vsd;
+
                mutex_enter(&vd->vdev_stat_lock);
                vd->vdev_stat.vs_checksum_errors++;
                mutex_exit(&vd->vdev_stat_lock);
+
+               zbc.zbc_has_cksum = 0;
+               zbc.zbc_injected = rm->rm_ecksuminjected;
+
+               zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+                   rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
+                   &zbc);
        }
+}
+
+/*
+ * We keep track of whether or not there were any injected errors, so that
+ * any ereports we generate can note it.
+ */
+static int
+raidz_checksum_verify(zio_t *zio)
+{
+       zio_bad_cksum_t zbc;
+       raidz_map_t *rm = zio->io_vsd;
+       int ret;
 
-       if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
-               zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
-                   zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
+       bzero(&zbc, sizeof (zio_bad_cksum_t));
+
+       ret = zio_checksum_error(zio, &zbc);
+       if (ret != 0 && zbc.zbc_injected != 0)
+               rm->rm_ecksuminjected = 1;
+
+       return (ret);
 }
 
 /*
@@ -748,17 +1674,14 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
                bcopy(rc->rc_data, orig[c], rc->rc_size);
        }
 
-       if (rm->rm_firstdatacol == 1)
-               vdev_raidz_generate_parity_p(rm);
-       else
-               vdev_raidz_generate_parity_pq(rm);
+       vdev_raidz_generate_parity(rm);
 
        for (c = 0; c < rm->rm_firstdatacol; c++) {
                rc = &rm->rm_col[c];
                if (!rc->rc_tried || rc->rc_error != 0)
                        continue;
                if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
-                       raidz_checksum_error(zio, rc);
+                       raidz_checksum_error(zio, rc, orig[c]);
                        rc->rc_error = ECKSUM;
                        ret++;
                }
@@ -768,34 +1691,193 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
        return (ret);
 }
 
-static uint64_t raidz_corrected_p;
-static uint64_t raidz_corrected_q;
-static uint64_t raidz_corrected_pq;
+/*
+ * Keep statistics on all the ways that we used parity to correct data.
+ */
+static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
 
 static int
 vdev_raidz_worst_error(raidz_map_t *rm)
 {
-       int error = 0;
+       int c, error = 0;
 
-       for (int c = 0; c < rm->rm_cols; c++)
+       for (c = 0; c < rm->rm_cols; c++)
                error = zio_worst_error(error, rm->rm_col[c].rc_error);
 
        return (error);
 }
 
+/*
+ * Iterate over all combinations of bad data and attempt a reconstruction.
+ * Note that the algorithm below is non-optimal because it doesn't take into
+ * account how reconstruction is actually performed. For example, with
+ * triple-parity RAID-Z the reconstruction procedure is the same if column 4
+ * is targeted as invalid as if columns 1 and 4 are targeted since in both
+ * cases we'd only use parity information in column 0.
+ */
+static int
+vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
+{
+       raidz_map_t *rm = zio->io_vsd;
+       raidz_col_t *rc;
+       void *orig[VDEV_RAIDZ_MAXPARITY];
+       int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+       int *tgts = &tstore[1];
+       int curr, next, i, c, n;
+       int code, ret = 0;
+
+       ASSERT(total_errors < rm->rm_firstdatacol);
+
+       /*
+        * This simplifies one edge condition.
+        */
+       tgts[-1] = -1;
+
+       for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
+               /*
+                * Initialize the targets array by finding the first n columns
+                * that contain no error.
+                *
+                * If there were no data errors, we need to ensure that we're
+                * always explicitly attempting to reconstruct at least one
+                * data column. To do this, we simply push the highest target
+                * up into the data columns.
+                */
+               for (c = 0, i = 0; i < n; i++) {
+                       if (i == n - 1 && data_errors == 0 &&
+                           c < rm->rm_firstdatacol) {
+                               c = rm->rm_firstdatacol;
+                       }
+
+                       while (rm->rm_col[c].rc_error != 0) {
+                               c++;
+                               ASSERT3S(c, <, rm->rm_cols);
+                       }
+
+                       tgts[i] = c++;
+               }
+
+               /*
+                * Setting tgts[n] simplifies the other edge condition.
+                */
+               tgts[n] = rm->rm_cols;
+
+               /*
+                * These buffers were allocated in previous iterations.
+                */
+               for (i = 0; i < n - 1; i++) {
+                       ASSERT(orig[i] != NULL);
+               }
+
+               orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
+
+               curr = 0;
+               next = tgts[curr];
+
+               while (curr != n) {
+                       tgts[curr] = next;
+                       curr = 0;
+
+                       /*
+                        * Save off the original data that we're going to
+                        * attempt to reconstruct.
+                        */
+                       for (i = 0; i < n; i++) {
+                               ASSERT(orig[i] != NULL);
+                               c = tgts[i];
+                               ASSERT3S(c, >=, 0);
+                               ASSERT3S(c, <, rm->rm_cols);
+                               rc = &rm->rm_col[c];
+                               bcopy(rc->rc_data, orig[i], rc->rc_size);
+                       }
+
+                       /*
+                        * Attempt a reconstruction and exit the outer loop on
+                        * success.
+                        */
+                       code = vdev_raidz_reconstruct(rm, tgts, n);
+                       if (raidz_checksum_verify(zio) == 0) {
+                               atomic_inc_64(&raidz_corrected[code]);
+
+                               for (i = 0; i < n; i++) {
+                                       c = tgts[i];
+                                       rc = &rm->rm_col[c];
+                                       ASSERT(rc->rc_error == 0);
+                                       if (rc->rc_tried)
+                                               raidz_checksum_error(zio, rc,
+                                                   orig[i]);
+                                       rc->rc_error = ECKSUM;
+                               }
+
+                               ret = code;
+                               goto done;
+                       }
+
+                       /*
+                        * Restore the original data.
+                        */
+                       for (i = 0; i < n; i++) {
+                               c = tgts[i];
+                               rc = &rm->rm_col[c];
+                               bcopy(orig[i], rc->rc_data, rc->rc_size);
+                       }
+
+                       do {
+                               /*
+                                * Find the next valid column after the curr
+                                * position..
+                                */
+                               for (next = tgts[curr] + 1;
+                                   next < rm->rm_cols &&
+                                   rm->rm_col[next].rc_error != 0; next++)
+                                       continue;
+
+                               ASSERT(next <= tgts[curr + 1]);
+
+                               /*
+                                * If that spot is available, we're done here.
+                                */
+                               if (next != tgts[curr + 1])
+                                       break;
+
+                               /*
+                                * Otherwise, find the next valid column after
+                                * the previous position.
+                                */
+                               for (c = tgts[curr - 1] + 1;
+                                   rm->rm_col[c].rc_error != 0; c++)
+                                       continue;
+
+                               tgts[curr] = c;
+                               curr++;
+
+                       } while (curr != n);
+               }
+       }
+       n--;
+done:
+       for (i = 0; i < n; i++) {
+               zio_buf_free(orig[i], rm->rm_col[0].rc_size);
+       }
+
+       return (ret);
+}
+
 static void
 vdev_raidz_io_done(zio_t *zio)
 {
        vdev_t *vd = zio->io_vd;
        vdev_t *cvd;
        raidz_map_t *rm = zio->io_vsd;
-       raidz_col_t *rc, *rc1;
+       raidz_col_t *rc = NULL;
        int unexpected_errors = 0;
        int parity_errors = 0;
        int parity_untried = 0;
        int data_errors = 0;
        int total_errors = 0;
-       int n, c, c1;
+       int n, c;
+       int tgts[VDEV_RAIDZ_MAXPARITY];
+       int code;
 
        ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
 
@@ -859,9 +1941,8 @@ vdev_raidz_io_done(zio_t *zio)
         * any errors.
         */
        if (total_errors <= rm->rm_firstdatacol - parity_untried) {
-               switch (data_errors) {
-               case 0:
-                       if (zio_checksum_error(zio) == 0) {
+               if (data_errors == 0) {
+                       if (raidz_checksum_verify(zio) == 0) {
                                /*
                                 * If we read parity information (unnecessarily
                                 * as it happens since no reconstruction was
@@ -880,9 +1961,7 @@ vdev_raidz_io_done(zio_t *zio)
                                }
                                goto done;
                        }
-                       break;
-
-               case 1:
+               } else {
                        /*
                         * We either attempt to read all the parity columns or
                         * none of them. If we didn't try to read parity, we
@@ -894,45 +1973,38 @@ vdev_raidz_io_done(zio_t *zio)
                        ASSERT(parity_errors < rm->rm_firstdatacol);
 
                        /*
-                        * Find the column that reported the error.
+                        * Identify the data columns that reported an error.
                         */
+                       n = 0;
                        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
-                               if (rc->rc_error != 0)
-                                       break;
+                               if (rc->rc_error != 0) {
+                                       ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+                                       tgts[n++] = c;
+                               }
                        }
-                       ASSERT(c != rm->rm_cols);
-                       ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
-                           rc->rc_error == ESTALE);
 
-                       if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
-                               vdev_raidz_reconstruct_p(rm, c);
-                       } else {
-                               ASSERT(rm->rm_firstdatacol > 1);
-                               vdev_raidz_reconstruct_q(rm, c);
-                       }
+                       ASSERT(rm->rm_firstdatacol >= n);
 
-                       if (zio_checksum_error(zio) == 0) {
-                               if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
-                                       atomic_inc_64(&raidz_corrected_p);
-                               else
-                                       atomic_inc_64(&raidz_corrected_q);
+                       code = vdev_raidz_reconstruct(rm, tgts, n);
+
+                       if (raidz_checksum_verify(zio) == 0) {
+                               atomic_inc_64(&raidz_corrected[code]);
 
                                /*
-                                * If there's more than one parity disk that
-                                * was successfully read, confirm that the
-                                * other parity disk produced the correct data.
-                                * This routine is suboptimal in that it
-                                * regenerates both the parity we wish to test
-                                * as well as the parity we just used to
-                                * perform the reconstruction, but this should
-                                * be a relatively uncommon case, and can be
-                                * optimized if it becomes a problem.
-                                * We also regenerate parity when resilvering
-                                * so we can write it out to the failed device
-                                * later.
+                                * If we read more parity disks than were used
+                                * for reconstruction, confirm that the other
+                                * parity disks produced correct data. This
+                                * routine is suboptimal in that it regenerates
+                                * the parity that we already used in addition
+                                * to the parity that we're attempting to
+                                * verify, but this should be a relatively
+                                * uncommon case, and can be optimized if it
+                                * becomes a problem. Note that we regenerate
+                                * parity when resilvering so we can write it
+                                * out to failed devices later.
                                 */
-                               if (parity_errors < rm->rm_firstdatacol - 1 ||
+                               if (parity_errors < rm->rm_firstdatacol - n ||
                                    (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                        n = raidz_parity_verify(zio, rm);
                                        unexpected_errors += n;
@@ -942,46 +2014,6 @@ vdev_raidz_io_done(zio_t *zio)
 
                                goto done;
                        }
-                       break;
-
-               case 2:
-                       /*
-                        * Two data column errors require double parity.
-                        */
-                       ASSERT(rm->rm_firstdatacol == 2);
-
-                       /*
-                        * Find the two columns that reported errors.
-                        */
-                       for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-                               rc = &rm->rm_col[c];
-                               if (rc->rc_error != 0)
-                                       break;
-                       }
-                       ASSERT(c != rm->rm_cols);
-                       ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
-                           rc->rc_error == ESTALE);
-
-                       for (c1 = c++; c < rm->rm_cols; c++) {
-                               rc = &rm->rm_col[c];
-                               if (rc->rc_error != 0)
-                                       break;
-                       }
-                       ASSERT(c != rm->rm_cols);
-                       ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
-                           rc->rc_error == ESTALE);
-
-                       vdev_raidz_reconstruct_pq(rm, c1, c);
-
-                       if (zio_checksum_error(zio) == 0) {
-                               atomic_inc_64(&raidz_corrected_pq);
-                               goto done;
-                       }
-                       break;
-
-               default:
-                       ASSERT(rm->rm_firstdatacol <= 2);
-                       ASSERT(0);
                }
        }
 
@@ -1020,152 +2052,61 @@ vdev_raidz_io_done(zio_t *zio)
         * errors we detected, and we've attempted to read all columns. There
         * must, therefore, be one or more additional problems -- silent errors
         * resulting in invalid data rather than explicit I/O errors resulting
-        * in absent data. Before we attempt combinatorial reconstruction make
-        * sure we have a chance of coming up with the right answer.
+        * in absent data. We check if there is enough additional data to
+        * possibly reconstruct the data and then perform combinatorial
+        * reconstruction over all possible combinations. If that fails,
+        * we're cooked.
         */
-       if (total_errors >= rm->rm_firstdatacol) {
+       if (total_errors > rm->rm_firstdatacol) {
                zio->io_error = vdev_raidz_worst_error(rm);
-               /*
-                * If there were exactly as many device errors as parity
-                * columns, yet we couldn't reconstruct the data, then at
-                * least one device must have returned bad data silently.
-                */
-               if (total_errors == rm->rm_firstdatacol)
-                       zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
-               goto done;
-       }
-
-       if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
-               /*
-                * Attempt to reconstruct the data from parity P.
-                */
-               for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-                       void *orig;
-                       rc = &rm->rm_col[c];
-
-                       orig = zio_buf_alloc(rc->rc_size);
-                       bcopy(rc->rc_data, orig, rc->rc_size);
-                       vdev_raidz_reconstruct_p(rm, c);
-
-                       if (zio_checksum_error(zio) == 0) {
-                               zio_buf_free(orig, rc->rc_size);
-                               atomic_inc_64(&raidz_corrected_p);
 
-                               /*
-                                * If this child didn't know that it returned
-                                * bad data, inform it.
-                                */
-                               if (rc->rc_tried && rc->rc_error == 0)
-                                       raidz_checksum_error(zio, rc);
-                               rc->rc_error = ECKSUM;
-                               goto done;
-                       }
-
-                       bcopy(orig, rc->rc_data, rc->rc_size);
-                       zio_buf_free(orig, rc->rc_size);
-               }
-       }
-
-       if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+       } else if (total_errors < rm->rm_firstdatacol &&
+           (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
                /*
-                * Attempt to reconstruct the data from parity Q.
+                * If we didn't use all the available parity for the
+                * combinatorial reconstruction, verify that the remaining
+                * parity is correct.
                 */
-               for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-                       void *orig;
-                       rc = &rm->rm_col[c];
-
-                       orig = zio_buf_alloc(rc->rc_size);
-                       bcopy(rc->rc_data, orig, rc->rc_size);
-                       vdev_raidz_reconstruct_q(rm, c);
-
-                       if (zio_checksum_error(zio) == 0) {
-                               zio_buf_free(orig, rc->rc_size);
-                               atomic_inc_64(&raidz_corrected_q);
-
-                               /*
-                                * If this child didn't know that it returned
-                                * bad data, inform it.
-                                */
-                               if (rc->rc_tried && rc->rc_error == 0)
-                                       raidz_checksum_error(zio, rc);
-                               rc->rc_error = ECKSUM;
-                               goto done;
-                       }
-
-                       bcopy(orig, rc->rc_data, rc->rc_size);
-                       zio_buf_free(orig, rc->rc_size);
-               }
-       }
-
-       if (rm->rm_firstdatacol > 1 &&
-           rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
-           rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+               if (code != (1 << rm->rm_firstdatacol) - 1)
+                       (void) raidz_parity_verify(zio, rm);
+       } else {
                /*
-                * Attempt to reconstruct the data from both P and Q.
+                * We're here because either:
+                *
+                *      total_errors == rm_first_datacol, or
+                *      vdev_raidz_combrec() failed
+                *
+                * In either case, there is enough bad data to prevent
+                * reconstruction.
+                *
+                * Start checksum ereports for all children which haven't
+                * failed, and the IO wasn't speculative.
                 */
-               for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
-                       void *orig, *orig1;
-                       rc = &rm->rm_col[c];
-
-                       orig = zio_buf_alloc(rc->rc_size);
-                       bcopy(rc->rc_data, orig, rc->rc_size);
-
-                       for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
-                               rc1 = &rm->rm_col[c1];
-
-                               orig1 = zio_buf_alloc(rc1->rc_size);
-                               bcopy(rc1->rc_data, orig1, rc1->rc_size);
-
-                               vdev_raidz_reconstruct_pq(rm, c, c1);
+               zio->io_error = ECKSUM;
 
-                               if (zio_checksum_error(zio) == 0) {
-                                       zio_buf_free(orig, rc->rc_size);
-                                       zio_buf_free(orig1, rc1->rc_size);
-                                       atomic_inc_64(&raidz_corrected_pq);
-
-                                       /*
-                                        * If these children didn't know they
-                                        * returned bad data, inform them.
-                                        */
-                                       if (rc->rc_tried && rc->rc_error == 0)
-                                               raidz_checksum_error(zio, rc);
-                                       if (rc1->rc_tried && rc1->rc_error == 0)
-                                               raidz_checksum_error(zio, rc1);
-
-                                       rc->rc_error = ECKSUM;
-                                       rc1->rc_error = ECKSUM;
-
-                                       goto done;
+               if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+                       for (c = 0; c < rm->rm_cols; c++) {
+                               rc = &rm->rm_col[c];
+                               if (rc->rc_error == 0) {
+                                       zio_bad_cksum_t zbc;
+                                       zbc.zbc_has_cksum = 0;
+                                       zbc.zbc_injected =
+                                           rm->rm_ecksuminjected;
+
+                                       zfs_ereport_start_checksum(
+                                           zio->io_spa,
+                                           vd->vdev_child[rc->rc_devidx],
+                                           zio, rc->rc_offset, rc->rc_size,
+                                           (void *)(uintptr_t)c, &zbc);
                                }
-
-                               bcopy(orig1, rc1->rc_data, rc1->rc_size);
-                               zio_buf_free(orig1, rc1->rc_size);
                        }
-
-                       bcopy(orig, rc->rc_data, rc->rc_size);
-                       zio_buf_free(orig, rc->rc_size);
-               }
-       }
-
-       /*
-        * All combinations failed to checksum. Generate checksum ereports for
-        * all children.
-        */
-       zio->io_error = ECKSUM;
-
-       if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
-               for (c = 0; c < rm->rm_cols; c++) {
-                       rc = &rm->rm_col[c];
-                       zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
-                           zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
-                           rc->rc_offset, rc->rc_size);
                }
        }
 
 done:
        zio_checksum_verified(zio);
 
-       if (zio->io_error == 0 && (spa_mode & FWRITE) &&
+       if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
            (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
                /*
                 * Use the good data we have in hand to repair damaged children.
@@ -1180,7 +2121,8 @@ done:
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_data, rc->rc_size,
                            ZIO_TYPE_WRITE, zio->io_priority,
-                           ZIO_FLAG_IO_REPAIR, NULL, NULL));
+                           ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+                           ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
                }
        }
 }
@@ -1204,6 +2146,8 @@ vdev_ops_t vdev_raidz_ops = {
        vdev_raidz_io_start,
        vdev_raidz_io_done,
        vdev_raidz_state_change,
+       NULL,
+       NULL,
        VDEV_TYPE_RAIDZ,        /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
 };