The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/cddl/boot/zfs/zfssubr.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or http://www.opensolaris.org/os/licensing.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
   23  * Use is subject to license terms.
   24  */
   25 
   26 #include <sys/cdefs.h>
   27 __FBSDID("$FreeBSD$");
   28 
   29 #include <lz4.h>
   30 
   31 static uint64_t zfs_crc64_table[256];
   32 
   33 #ifndef ASSERT3S        /* Proxy for all the assert defines */
   34 #define ASSERT3S(x, y, z)       ((void)0)
   35 #define ASSERT3U(x, y, z)       ((void)0)
   36 #define ASSERT3P(x, y, z)       ((void)0)
   37 #define ASSERT0(x)              ((void)0)
   38 #define ASSERT(x)               ((void)0)
   39 #endif
   40 
   41 #define panic(...)      do {                                            \
   42         printf(__VA_ARGS__);                                            \
   43         for (;;) ;                                                      \
   44 } while (0)
   45 
   46 static void
   47 zfs_init_crc(void)
   48 {
   49         int i, j;
   50         uint64_t *ct;
   51 
   52         /*
   53          * Calculate the crc64 table (used for the zap hash
   54          * function).
   55          */
   56         if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
   57                 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
   58                 for (i = 0; i < 256; i++)
   59                         for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
   60                                 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
   61         }
   62 }
   63 
   64 static void
   65 zio_checksum_off(const void *buf, uint64_t size,
   66     const void *ctx_template, zio_cksum_t *zcp)
   67 {
   68         ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
   69 }
   70 
   71 /*
   72  * Signature for checksum functions.
   73  */
   74 typedef void zio_checksum_t(const void *data, uint64_t size,
   75     const void *ctx_template, zio_cksum_t *zcp);
   76 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
   77 typedef void zio_checksum_tmpl_free_t(void *ctx_template);
   78 
   79 typedef enum zio_checksum_flags {
   80         /* Strong enough for metadata? */
   81         ZCHECKSUM_FLAG_METADATA = (1 << 1),
   82         /* ZIO embedded checksum */
   83         ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
   84         /* Strong enough for dedup (without verification)? */
   85         ZCHECKSUM_FLAG_DEDUP = (1 << 3),
   86         /* Uses salt value */
   87         ZCHECKSUM_FLAG_SALTED = (1 << 4),
   88         /* Strong enough for nopwrite? */
   89         ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
   90 } zio_checksum_flags_t;
   91 
   92 /*
   93  * Information about each checksum function.
   94  */
   95 typedef struct zio_checksum_info {
   96         /* checksum function for each byteorder */
   97         zio_checksum_t                  *ci_func[2];
   98         zio_checksum_tmpl_init_t        *ci_tmpl_init;
   99         zio_checksum_tmpl_free_t        *ci_tmpl_free;
  100         zio_checksum_flags_t            ci_flags;
  101         const char                      *ci_name;       /* descriptive name */
  102 } zio_checksum_info_t;
  103 
  104 #include "blkptr.c"
  105 
  106 #include "fletcher.c"
  107 #include "blake3_zfs.c"
  108 #include "sha256.c"
  109 #include "skein_zfs.c"
  110 
  111 #ifdef HAS_ZSTD_ZFS
  112 extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
  113     size_t d_len, int n);
  114 #endif
  115 
  116 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
  117         {{NULL, NULL}, NULL, NULL, 0, "inherit"},
  118         {{NULL, NULL}, NULL, NULL, 0, "on"},
  119         {{zio_checksum_off,     zio_checksum_off}, NULL, NULL, 0, "off"},
  120         {{zio_checksum_SHA256,  zio_checksum_SHA256}, NULL, NULL,
  121             ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
  122         {{zio_checksum_SHA256,  zio_checksum_SHA256}, NULL, NULL,
  123             ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
  124         {{fletcher_2_native,    fletcher_2_byteswap}, NULL, NULL,
  125             ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
  126         {{fletcher_2_native,    fletcher_2_byteswap}, NULL, NULL,
  127             0, "fletcher2"},
  128         {{fletcher_4_native,    fletcher_4_byteswap}, NULL, NULL,
  129             ZCHECKSUM_FLAG_METADATA, "fletcher4"},
  130         {{zio_checksum_SHA256,  zio_checksum_SHA256}, NULL, NULL,
  131             ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
  132             ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
  133         {{fletcher_4_native,    fletcher_4_byteswap}, NULL, NULL,
  134             ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
  135         {{zio_checksum_off,     zio_checksum_off}, NULL, NULL,
  136             0, "noparity"},
  137         {{zio_checksum_SHA512_native,   zio_checksum_SHA512_byteswap},
  138             NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
  139             ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
  140         {{zio_checksum_skein_native, zio_checksum_skein_byteswap},
  141             zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
  142             ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
  143             ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
  144         /* no edonr for now */
  145         {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
  146             ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
  147         {{zio_checksum_blake3_native,   zio_checksum_blake3_byteswap},
  148             zio_checksum_blake3_tmpl_init, zio_checksum_blake3_tmpl_free,
  149             ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
  150             ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}
  151 };
  152 
  153 /*
  154  * Common signature for all zio compress/decompress functions.
  155  */
  156 typedef size_t zio_compress_func_t(void *src, void *dst,
  157     size_t s_len, size_t d_len, int);
  158 typedef int zio_decompress_func_t(void *src, void *dst,
  159     size_t s_len, size_t d_len, int);
  160 
  161 /*
  162  * Information about each compression function.
  163  */
  164 typedef struct zio_compress_info {
  165         zio_compress_func_t     *ci_compress;   /* compression function */
  166         zio_decompress_func_t   *ci_decompress; /* decompression function */
  167         int                     ci_level;       /* level parameter */
  168         const char              *ci_name;       /* algorithm name */
  169 } zio_compress_info_t;
  170 
  171 #include "lzjb.c"
  172 #include "zle.c"
  173 #include "gzip.c"
  174 
  175 /*
  176  * Compression vectors.
  177  */
  178 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
  179         {NULL,                  NULL,                   0,      "inherit"},
  180         {NULL,                  NULL,                   0,      "on"},
  181         {NULL,                  NULL,                   0,      "uncompressed"},
  182         {NULL,                  lzjb_decompress,        0,      "lzjb"},
  183         {NULL,                  NULL,                   0,      "empty"},
  184         {NULL,                  gzip_decompress,        1,      "gzip-1"},
  185         {NULL,                  gzip_decompress,        2,      "gzip-2"},
  186         {NULL,                  gzip_decompress,        3,      "gzip-3"},
  187         {NULL,                  gzip_decompress,        4,      "gzip-4"},
  188         {NULL,                  gzip_decompress,        5,      "gzip-5"},
  189         {NULL,                  gzip_decompress,        6,      "gzip-6"},
  190         {NULL,                  gzip_decompress,        7,      "gzip-7"},
  191         {NULL,                  gzip_decompress,        8,      "gzip-8"},
  192         {NULL,                  gzip_decompress,        9,      "gzip-9"},
  193         {NULL,                  zle_decompress,         64,     "zle"},
  194         {NULL,                  lz4_decompress,         0,      "lz4"},
  195 #ifdef HAS_ZSTD_ZFS
  196         {NULL,                  zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
  197 #endif
  198 };
  199 
  200 static void
  201 byteswap_uint64_array(void *vbuf, size_t size)
  202 {
  203         uint64_t *buf = vbuf;
  204         size_t count = size >> 3;
  205         int i;
  206 
  207         ASSERT((size & 7) == 0);
  208 
  209         for (i = 0; i < count; i++)
  210                 buf[i] = BSWAP_64(buf[i]);
  211 }
  212 
  213 /*
  214  * Set the external verifier for a gang block based on <vdev, offset, txg>,
  215  * a tuple which is guaranteed to be unique for the life of the pool.
  216  */
  217 static void
  218 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
  219 {
  220         const dva_t *dva = BP_IDENTITY(bp);
  221         uint64_t txg = BP_PHYSICAL_BIRTH(bp);
  222 
  223         ASSERT(BP_IS_GANG(bp));
  224 
  225         ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
  226 }
  227 
  228 /*
  229  * Set the external verifier for a label block based on its offset.
  230  * The vdev is implicit, and the txg is unknowable at pool open time --
  231  * hence the logic in vdev_uberblock_load() to find the most recent copy.
  232  */
  233 static void
  234 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
  235 {
  236         ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
  237 }
  238 
  239 /*
  240  * Calls the template init function of a checksum which supports context
  241  * templates and installs the template into the spa_t.
  242  */
  243 static void
  244 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
  245 {
  246         zio_checksum_info_t *ci = &zio_checksum_table[checksum];
  247 
  248         if (ci->ci_tmpl_init == NULL)
  249                 return;
  250 
  251         if (spa->spa_cksum_tmpls[checksum] != NULL)
  252                 return;
  253 
  254         if (spa->spa_cksum_tmpls[checksum] == NULL) {
  255                 spa->spa_cksum_tmpls[checksum] =
  256                     ci->ci_tmpl_init(&spa->spa_cksum_salt);
  257         }
  258 }
  259 
  260 /*
  261  * Called by a spa_t that's about to be deallocated. This steps through
  262  * all of the checksum context templates and deallocates any that were
  263  * initialized using the algorithm-specific template init function.
  264  */
  265 static void __unused
  266 zio_checksum_templates_free(spa_t *spa)
  267 {
  268         for (enum zio_checksum checksum = 0;
  269             checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
  270                 if (spa->spa_cksum_tmpls[checksum] != NULL) {
  271                         zio_checksum_info_t *ci = &zio_checksum_table[checksum];
  272 
  273                         ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
  274                         spa->spa_cksum_tmpls[checksum] = NULL;
  275                 }
  276         }
  277 }
  278 
  279 static int
  280 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
  281 {
  282         uint64_t size;
  283         unsigned int checksum;
  284         zio_checksum_info_t *ci;
  285         void *ctx = NULL;
  286         zio_cksum_t actual_cksum, expected_cksum, verifier;
  287         int byteswap;
  288 
  289         checksum = BP_GET_CHECKSUM(bp);
  290         size = BP_GET_PSIZE(bp);
  291 
  292         if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
  293                 return (EINVAL);
  294         ci = &zio_checksum_table[checksum];
  295         if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
  296                 return (EINVAL);
  297 
  298         if (spa != NULL) {
  299                 zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
  300                 ctx = spa->spa_cksum_tmpls[checksum];
  301         }
  302 
  303         if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
  304                 zio_eck_t *eck;
  305 
  306                 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
  307                     checksum == ZIO_CHECKSUM_LABEL);
  308 
  309                 eck = (zio_eck_t *)((char *)data + size) - 1;
  310 
  311                 if (checksum == ZIO_CHECKSUM_GANG_HEADER)
  312                         zio_checksum_gang_verifier(&verifier, bp);
  313                 else if (checksum == ZIO_CHECKSUM_LABEL)
  314                         zio_checksum_label_verifier(&verifier,
  315                             DVA_GET_OFFSET(BP_IDENTITY(bp)));
  316                 else
  317                         verifier = bp->blk_cksum;
  318 
  319                 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
  320 
  321                 if (byteswap)
  322                         byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
  323 
  324                 expected_cksum = eck->zec_cksum;
  325                 eck->zec_cksum = verifier;
  326                 ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
  327                 eck->zec_cksum = expected_cksum;
  328 
  329                 if (byteswap)
  330                         byteswap_uint64_array(&expected_cksum,
  331                             sizeof (zio_cksum_t));
  332         } else {
  333                 byteswap = BP_SHOULD_BYTESWAP(bp);
  334                 expected_cksum = bp->blk_cksum;
  335                 ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
  336         }
  337 
  338         if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
  339                 /*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
  340                 return (EIO);
  341         }
  342 
  343         return (0);
  344 }
  345 
  346 static int
  347 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
  348         void *dest, uint64_t destsize)
  349 {
  350         zio_compress_info_t *ci;
  351 
  352         if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
  353                 printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
  354                 return (EIO);
  355         }
  356 
  357         ci = &zio_compress_table[cpfunc];
  358         if (!ci->ci_decompress) {
  359                 printf("ZFS: unsupported compression algorithm %s\n",
  360                     ci->ci_name);
  361                 return (EIO);
  362         }
  363 
  364         return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
  365 }
  366 
  367 static uint64_t
  368 zap_hash(uint64_t salt, const char *name)
  369 {
  370         const uint8_t *cp;
  371         uint8_t c;
  372         uint64_t crc = salt;
  373 
  374         ASSERT(crc != 0);
  375         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
  376         for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
  377                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
  378 
  379         /*
  380          * Only use 28 bits, since we need 4 bits in the cookie for the
  381          * collision differentiator.  We MUST use the high bits, since
  382          * those are the onces that we first pay attention to when
  383          * chosing the bucket.
  384          */
  385         crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
  386 
  387         return (crc);
  388 }
  389 
  390 typedef struct raidz_col {
  391         uint64_t rc_devidx;             /* child device index for I/O */
  392         uint64_t rc_offset;             /* device offset */
  393         uint64_t rc_size;               /* I/O size */
  394         void *rc_data;                  /* I/O data */
  395         int rc_error;                   /* I/O error for this device */
  396         uint8_t rc_tried;               /* Did we attempt this I/O column? */
  397         uint8_t rc_skipped;             /* Did we skip this I/O column? */
  398 } raidz_col_t;
  399 
  400 typedef struct raidz_map {
  401         uint64_t rm_cols;               /* Regular column count */
  402         uint64_t rm_scols;              /* Count including skipped columns */
  403         uint64_t rm_bigcols;            /* Number of oversized columns */
  404         uint64_t rm_asize;              /* Actual total I/O size */
  405         uint64_t rm_missingdata;        /* Count of missing data devices */
  406         uint64_t rm_missingparity;      /* Count of missing parity devices */
  407         uint64_t rm_firstdatacol;       /* First data column/parity count */
  408         uint64_t rm_nskip;              /* Skipped sectors for padding */
  409         uint64_t rm_skipstart;          /* Column index of padding start */
  410         uintptr_t rm_reports;           /* # of referencing checksum reports */
  411         uint8_t rm_freed;               /* map no longer has referencing ZIO */
  412         uint8_t rm_ecksuminjected;      /* checksum error was injected */
  413         raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
  414 } raidz_map_t;
  415 
  416 #define VDEV_RAIDZ_P            0
  417 #define VDEV_RAIDZ_Q            1
  418 #define VDEV_RAIDZ_R            2
  419 
  420 #define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
  421 #define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
  422 
  423 /*
  424  * We provide a mechanism to perform the field multiplication operation on a
  425  * 64-bit value all at once rather than a byte at a time. This works by
  426  * creating a mask from the top bit in each byte and using that to
  427  * conditionally apply the XOR of 0x1d.
  428  */
  429 #define VDEV_RAIDZ_64MUL_2(x, mask) \
  430 { \
  431         (mask) = (x) & 0x8080808080808080ULL; \
  432         (mask) = ((mask) << 1) - ((mask) >> 7); \
  433         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
  434             ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
  435 }
  436 
  437 #define VDEV_RAIDZ_64MUL_4(x, mask) \
  438 { \
  439         VDEV_RAIDZ_64MUL_2((x), mask); \
  440         VDEV_RAIDZ_64MUL_2((x), mask); \
  441 }
  442 
  443 /*
  444  * These two tables represent powers and logs of 2 in the Galois field defined
  445  * above. These values were computed by repeatedly multiplying by 2 as above.
  446  */
  447 static const uint8_t vdev_raidz_pow2[256] = {
  448         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
  449         0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
  450         0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
  451         0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
  452         0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
  453         0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
  454         0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
  455         0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
  456         0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
  457         0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
  458         0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
  459         0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
  460         0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
  461         0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
  462         0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
  463         0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
  464         0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
  465         0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
  466         0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
  467         0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
  468         0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
  469         0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
  470         0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
  471         0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
  472         0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
  473         0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
  474         0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
  475         0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
  476         0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
  477         0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
  478         0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
  479         0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
  480 };
  481 static const uint8_t vdev_raidz_log2[256] = {
  482         0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
  483         0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
  484         0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
  485         0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
  486         0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
  487         0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
  488         0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
  489         0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
  490         0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
  491         0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
  492         0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
  493         0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
  494         0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
  495         0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
  496         0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
  497         0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
  498         0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
  499         0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
  500         0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
  501         0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
  502         0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
  503         0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
  504         0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
  505         0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
  506         0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
  507         0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
  508         0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
  509         0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
  510         0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
  511         0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
  512         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
  513         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
  514 };
  515 
  516 /*
  517  * Multiply a given number by 2 raised to the given power.
  518  */
  519 static uint8_t
  520 vdev_raidz_exp2(uint8_t a, int exp)
  521 {
  522         if (a == 0)
  523                 return (0);
  524 
  525         ASSERT(exp >= 0);
  526         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
  527 
  528         exp += vdev_raidz_log2[a];
  529         if (exp > 255)
  530                 exp -= 255;
  531 
  532         return (vdev_raidz_pow2[exp]);
  533 }
  534 
  535 static void
  536 vdev_raidz_generate_parity_p(raidz_map_t *rm)
  537 {
  538         uint64_t *p, *src, ccount, i;
  539         uint64_t pcount __unused;
  540         int c;
  541 
  542         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
  543 
  544         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
  545                 src = rm->rm_col[c].rc_data;
  546                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
  547                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
  548 
  549                 if (c == rm->rm_firstdatacol) {
  550                         ASSERT(ccount == pcount);
  551                         for (i = 0; i < ccount; i++, src++, p++) {
  552                                 *p = *src;
  553                         }
  554                 } else {
  555                         ASSERT(ccount <= pcount);
  556                         for (i = 0; i < ccount; i++, src++, p++) {
  557                                 *p ^= *src;
  558                         }
  559                 }
  560         }
  561 }
  562 
  563 static void
  564 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
  565 {
  566         uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
  567         int c;
  568 
  569         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
  570         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
  571             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
  572 
  573         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
  574                 src = rm->rm_col[c].rc_data;
  575                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
  576                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
  577 
  578                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
  579 
  580                 if (c == rm->rm_firstdatacol) {
  581                         ASSERT(ccnt == pcnt || ccnt == 0);
  582                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
  583                                 *p = *src;
  584                                 *q = *src;
  585                         }
  586                         for (; i < pcnt; i++, src++, p++, q++) {
  587                                 *p = 0;
  588                                 *q = 0;
  589                         }
  590                 } else {
  591                         ASSERT(ccnt <= pcnt);
  592 
  593                         /*
  594                          * Apply the algorithm described above by multiplying
  595                          * the previous result and adding in the new value.
  596                          */
  597                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
  598                                 *p ^= *src;
  599 
  600                                 VDEV_RAIDZ_64MUL_2(*q, mask);
  601                                 *q ^= *src;
  602                         }
  603 
  604                         /*
  605                          * Treat short columns as though they are full of 0s.
  606                          * Note that there's therefore nothing needed for P.
  607                          */
  608                         for (; i < pcnt; i++, q++) {
  609                                 VDEV_RAIDZ_64MUL_2(*q, mask);
  610                         }
  611                 }
  612         }
  613 }
  614 
  615 static void
  616 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
  617 {
  618         uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
  619         int c;
  620 
  621         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
  622         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
  623             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
  624         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
  625             rm->rm_col[VDEV_RAIDZ_R].rc_size);
  626 
  627         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
  628                 src = rm->rm_col[c].rc_data;
  629                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
  630                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
  631                 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
  632 
  633                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
  634 
  635                 if (c == rm->rm_firstdatacol) {
  636                         ASSERT(ccnt == pcnt || ccnt == 0);
  637                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
  638                                 *p = *src;
  639                                 *q = *src;
  640                                 *r = *src;
  641                         }
  642                         for (; i < pcnt; i++, src++, p++, q++, r++) {
  643                                 *p = 0;
  644                                 *q = 0;
  645                                 *r = 0;
  646                         }
  647                 } else {
  648                         ASSERT(ccnt <= pcnt);
  649 
  650                         /*
  651                          * Apply the algorithm described above by multiplying
  652                          * the previous result and adding in the new value.
  653                          */
  654                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
  655                                 *p ^= *src;
  656 
  657                                 VDEV_RAIDZ_64MUL_2(*q, mask);
  658                                 *q ^= *src;
  659 
  660                                 VDEV_RAIDZ_64MUL_4(*r, mask);
  661                                 *r ^= *src;
  662                         }
  663 
  664                         /*
  665                          * Treat short columns as though they are full of 0s.
  666                          * Note that there's therefore nothing needed for P.
  667                          */
  668                         for (; i < pcnt; i++, q++, r++) {
  669                                 VDEV_RAIDZ_64MUL_2(*q, mask);
  670                                 VDEV_RAIDZ_64MUL_4(*r, mask);
  671                         }
  672                 }
  673         }
  674 }
  675 
  676 /*
  677  * Generate RAID parity in the first virtual columns according to the number of
  678  * parity columns available.
  679  */
  680 static void
  681 vdev_raidz_generate_parity(raidz_map_t *rm)
  682 {
  683         switch (rm->rm_firstdatacol) {
  684         case 1:
  685                 vdev_raidz_generate_parity_p(rm);
  686                 break;
  687         case 2:
  688                 vdev_raidz_generate_parity_pq(rm);
  689                 break;
  690         case 3:
  691                 vdev_raidz_generate_parity_pqr(rm);
  692                 break;
  693         default:
  694                 panic("invalid RAID-Z configuration");
  695         }
  696 }
  697 
  698 /* BEGIN CSTYLED */
  699 /*
  700  * In the general case of reconstruction, we must solve the system of linear
  701  * equations defined by the coeffecients used to generate parity as well as
  702  * the contents of the data and parity disks. This can be expressed with
  703  * vectors for the original data (D) and the actual data (d) and parity (p)
  704  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
  705  *
  706  *            __   __                     __     __
  707  *            |     |         __     __   |  p_0  |
  708  *            |  V  |         |  D_0  |   | p_m-1 |
  709  *            |     |    x    |   :   | = |  d_0  |
  710  *            |  I  |         | D_n-1 |   |   :   |
  711  *            |     |         ~~     ~~   | d_n-1 |
  712  *            ~~   ~~                     ~~     ~~
  713  *
  714  * I is simply a square identity matrix of size n, and V is a vandermonde
  715  * matrix defined by the coeffecients we chose for the various parity columns
  716  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
  717  * computation as well as linear separability.
  718  *
  719  *      __               __               __     __
  720  *      |   1   ..  1 1 1 |               |  p_0  |
  721  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
  722  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
  723  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
  724  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
  725  *      |   :       : : : |   |   :   |   |  d_2  |
  726  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
  727  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
  728  *      |   0   ..  0 0 1 |               | d_n-1 |
  729  *      ~~               ~~               ~~     ~~
  730  *
  731  * Note that I, V, d, and p are known. To compute D, we must invert the
  732  * matrix and use the known data and parity values to reconstruct the unknown
  733  * data values. We begin by removing the rows in V|I and d|p that correspond
  734  * to failed or missing columns; we then make V|I square (n x n) and d|p
  735  * sized n by removing rows corresponding to unused parity from the bottom up
  736  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
  737  * using Gauss-Jordan elimination. In the example below we use m=3 parity
  738  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
  739  *           __                               __
  740  *           |  1   1   1   1   1   1   1   1  |
  741  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
  742  *           |  19 205 116  29  64  16  4   1  |      / /
  743  *           |  1   0   0   0   0   0   0   0  |     / /
  744  *           |  0   1   0   0   0   0   0   0  | <--' /
  745  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
  746  *           |  0   0   0   1   0   0   0   0  |
  747  *           |  0   0   0   0   1   0   0   0  |
  748  *           |  0   0   0   0   0   1   0   0  |
  749  *           |  0   0   0   0   0   0   1   0  |
  750  *           |  0   0   0   0   0   0   0   1  |
  751  *           ~~                               ~~
  752  *           __                               __
  753  *           |  1   1   1   1   1   1   1   1  |
  754  *           | 128  64  32  16  8   4   2   1  |
  755  *           |  19 205 116  29  64  16  4   1  |
  756  *           |  1   0   0   0   0   0   0   0  |
  757  *           |  0   1   0   0   0   0   0   0  |
  758  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
  759  *           |  0   0   0   1   0   0   0   0  |
  760  *           |  0   0   0   0   1   0   0   0  |
  761  *           |  0   0   0   0   0   1   0   0  |
  762  *           |  0   0   0   0   0   0   1   0  |
  763  *           |  0   0   0   0   0   0   0   1  |
  764  *           ~~                               ~~
  765  *
  766  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
  767  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
  768  * matrix is not singular.
  769  * __                                                                 __
  770  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  771  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  772  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  773  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  774  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  775  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  776  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  777  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  778  * ~~                                                                 ~~
  779  * __                                                                 __
  780  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  781  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  782  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  783  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  784  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  785  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  786  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  787  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  788  * ~~                                                                 ~~
  789  * __                                                                 __
  790  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  791  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  792  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
  793  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  794  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  795  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  796  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  797  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  798  * ~~                                                                 ~~
  799  * __                                                                 __
  800  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  801  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  802  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
  803  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  804  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  805  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  806  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  807  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  808  * ~~                                                                 ~~
  809  * __                                                                 __
  810  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  811  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  812  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  813  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  814  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  815  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  816  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  817  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  818  * ~~                                                                 ~~
  819  * __                                                                 __
  820  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  821  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
  822  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  823  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  824  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  825  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  826  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  827  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  828  * ~~                                                                 ~~
  829  *                   __                               __
  830  *                   |  0   0   1   0   0   0   0   0  |
  831  *                   | 167 100  5   41 159 169 217 208 |
  832  *                   | 166 100  4   40 158 168 216 209 |
  833  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
  834  *                   |  0   0   0   0   1   0   0   0  |
  835  *                   |  0   0   0   0   0   1   0   0  |
  836  *                   |  0   0   0   0   0   0   1   0  |
  837  *                   |  0   0   0   0   0   0   0   1  |
  838  *                   ~~                               ~~
  839  *
  840  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
  841  * of the missing data.
  842  *
  843  * As is apparent from the example above, the only non-trivial rows in the
  844  * inverse matrix correspond to the data disks that we're trying to
  845  * reconstruct. Indeed, those are the only rows we need as the others would
  846  * only be useful for reconstructing data known or assumed to be valid. For
  847  * that reason, we only build the coefficients in the rows that correspond to
  848  * targeted columns.
  849  */
  850 /* END CSTYLED */
  851 
  852 static void
  853 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
  854     uint8_t **rows)
  855 {
  856         int i, j;
  857         int pow;
  858 
  859         ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
  860 
  861         /*
  862          * Fill in the missing rows of interest.
  863          */
  864         for (i = 0; i < nmap; i++) {
  865                 ASSERT3S(0, <=, map[i]);
  866                 ASSERT3S(map[i], <=, 2);
  867 
  868                 pow = map[i] * n;
  869                 if (pow > 255)
  870                         pow -= 255;
  871                 ASSERT(pow <= 255);
  872 
  873                 for (j = 0; j < n; j++) {
  874                         pow -= map[i];
  875                         if (pow < 0)
  876                                 pow += 255;
  877                         rows[i][j] = vdev_raidz_pow2[pow];
  878                 }
  879         }
  880 }
  881 
  882 static void
  883 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
  884     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
  885 {
  886         int i, j, ii, jj;
  887         uint8_t log;
  888 
  889         /*
  890          * Assert that the first nmissing entries from the array of used
  891          * columns correspond to parity columns and that subsequent entries
  892          * correspond to data columns.
  893          */
  894         for (i = 0; i < nmissing; i++) {
  895                 ASSERT3S(used[i], <, rm->rm_firstdatacol);
  896         }
  897         for (; i < n; i++) {
  898                 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
  899         }
  900 
  901         /*
  902          * First initialize the storage where we'll compute the inverse rows.
  903          */
  904         for (i = 0; i < nmissing; i++) {
  905                 for (j = 0; j < n; j++) {
  906                         invrows[i][j] = (i == j) ? 1 : 0;
  907                 }
  908         }
  909 
  910         /*
  911          * Subtract all trivial rows from the rows of consequence.
  912          */
  913         for (i = 0; i < nmissing; i++) {
  914                 for (j = nmissing; j < n; j++) {
  915                         ASSERT3U(used[j], >=, rm->rm_firstdatacol);
  916                         jj = used[j] - rm->rm_firstdatacol;
  917                         ASSERT3S(jj, <, n);
  918                         invrows[i][j] = rows[i][jj];
  919                         rows[i][jj] = 0;
  920                 }
  921         }
  922 
  923         /*
  924          * For each of the rows of interest, we must normalize it and subtract
  925          * a multiple of it from the other rows.
  926          */
  927         for (i = 0; i < nmissing; i++) {
  928                 for (j = 0; j < missing[i]; j++) {
  929                         ASSERT3U(rows[i][j], ==, 0);
  930                 }
  931                 ASSERT3U(rows[i][missing[i]], !=, 0);
  932 
  933                 /*
  934                  * Compute the inverse of the first element and multiply each
  935                  * element in the row by that value.
  936                  */
  937                 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
  938 
  939                 for (j = 0; j < n; j++) {
  940                         rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
  941                         invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
  942                 }
  943 
  944                 for (ii = 0; ii < nmissing; ii++) {
  945                         if (i == ii)
  946                                 continue;
  947 
  948                         ASSERT3U(rows[ii][missing[i]], !=, 0);
  949 
  950                         log = vdev_raidz_log2[rows[ii][missing[i]]];
  951 
  952                         for (j = 0; j < n; j++) {
  953                                 rows[ii][j] ^=
  954                                     vdev_raidz_exp2(rows[i][j], log);
  955                                 invrows[ii][j] ^=
  956                                     vdev_raidz_exp2(invrows[i][j], log);
  957                         }
  958                 }
  959         }
  960 
  961         /*
  962          * Verify that the data that is left in the rows are properly part of
  963          * an identity matrix.
  964          */
  965         for (i = 0; i < nmissing; i++) {
  966                 for (j = 0; j < n; j++) {
  967                         if (j == missing[i]) {
  968                                 ASSERT3U(rows[i][j], ==, 1);
  969                         } else {
  970                                 ASSERT3U(rows[i][j], ==, 0);
  971                         }
  972                 }
  973         }
  974 }
  975 
  976 static void
  977 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
  978     int *missing, uint8_t **invrows, const uint8_t *used)
  979 {
  980         int i, j, x, cc, c;
  981         uint8_t *src;
  982         uint64_t ccount;
  983         uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
  984         uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
  985         uint8_t log, val;
  986         int ll;
  987         uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
  988         uint8_t *p, *pp;
  989         size_t psize;
  990 
  991         log = 0;        /* gcc */
  992         psize = sizeof (invlog[0][0]) * n * nmissing;
  993         p = malloc(psize);
  994         if (p == NULL) {
  995                 printf("Out of memory\n");
  996                 return;
  997         }
  998 
  999         for (pp = p, i = 0; i < nmissing; i++) {
 1000                 invlog[i] = pp;
 1001                 pp += n;
 1002         }
 1003 
 1004         for (i = 0; i < nmissing; i++) {
 1005                 for (j = 0; j < n; j++) {
 1006                         ASSERT3U(invrows[i][j], !=, 0);
 1007                         invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
 1008                 }
 1009         }
 1010 
 1011         for (i = 0; i < n; i++) {
 1012                 c = used[i];
 1013                 ASSERT3U(c, <, rm->rm_cols);
 1014 
 1015                 src = rm->rm_col[c].rc_data;
 1016                 ccount = rm->rm_col[c].rc_size;
 1017                 for (j = 0; j < nmissing; j++) {
 1018                         cc = missing[j] + rm->rm_firstdatacol;
 1019                         ASSERT3U(cc, >=, rm->rm_firstdatacol);
 1020                         ASSERT3U(cc, <, rm->rm_cols);
 1021                         ASSERT3U(cc, !=, c);
 1022 
 1023                         dst[j] = rm->rm_col[cc].rc_data;
 1024                         dcount[j] = rm->rm_col[cc].rc_size;
 1025                 }
 1026 
 1027                 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
 1028 
 1029                 for (x = 0; x < ccount; x++, src++) {
 1030                         if (*src != 0)
 1031                                 log = vdev_raidz_log2[*src];
 1032 
 1033                         for (cc = 0; cc < nmissing; cc++) {
 1034                                 if (x >= dcount[cc])
 1035                                         continue;
 1036 
 1037                                 if (*src == 0) {
 1038                                         val = 0;
 1039                                 } else {
 1040                                         if ((ll = log + invlog[cc][i]) >= 255)
 1041                                                 ll -= 255;
 1042                                         val = vdev_raidz_pow2[ll];
 1043                                 }
 1044 
 1045                                 if (i == 0)
 1046                                         dst[cc][x] = val;
 1047                                 else
 1048                                         dst[cc][x] ^= val;
 1049                         }
 1050                 }
 1051         }
 1052 
 1053         free(p);
 1054 }
 1055 
 1056 static int
 1057 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
 1058 {
 1059         int n, i, c, t, tt;
 1060         int nmissing_rows;
 1061         int missing_rows[VDEV_RAIDZ_MAXPARITY];
 1062         int parity_map[VDEV_RAIDZ_MAXPARITY];
 1063 
 1064         uint8_t *p, *pp;
 1065         size_t psize;
 1066 
 1067         uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
 1068         uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
 1069         uint8_t *used;
 1070 
 1071         int code = 0;
 1072 
 1073 
 1074         n = rm->rm_cols - rm->rm_firstdatacol;
 1075 
 1076         /*
 1077          * Figure out which data columns are missing.
 1078          */
 1079         nmissing_rows = 0;
 1080         for (t = 0; t < ntgts; t++) {
 1081                 if (tgts[t] >= rm->rm_firstdatacol) {
 1082                         missing_rows[nmissing_rows++] =
 1083                             tgts[t] - rm->rm_firstdatacol;
 1084                 }
 1085         }
 1086 
 1087         /*
 1088          * Figure out which parity columns to use to help generate the missing
 1089          * data columns.
 1090          */
 1091         for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
 1092                 ASSERT(tt < ntgts);
 1093                 ASSERT(c < rm->rm_firstdatacol);
 1094 
 1095                 /*
 1096                  * Skip any targeted parity columns.
 1097                  */
 1098                 if (c == tgts[tt]) {
 1099                         tt++;
 1100                         continue;
 1101                 }
 1102 
 1103                 code |= 1 << c;
 1104 
 1105                 parity_map[i] = c;
 1106                 i++;
 1107         }
 1108 
 1109         ASSERT(code != 0);
 1110         ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
 1111 
 1112         psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
 1113             nmissing_rows * n + sizeof (used[0]) * n;
 1114         p = malloc(psize);
 1115         if (p == NULL) {
 1116                 printf("Out of memory\n");
 1117                 return (code);
 1118         }
 1119 
 1120         for (pp = p, i = 0; i < nmissing_rows; i++) {
 1121                 rows[i] = pp;
 1122                 pp += n;
 1123                 invrows[i] = pp;
 1124                 pp += n;
 1125         }
 1126         used = pp;
 1127 
 1128         for (i = 0; i < nmissing_rows; i++) {
 1129                 used[i] = parity_map[i];
 1130         }
 1131 
 1132         for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 1133                 if (tt < nmissing_rows &&
 1134                     c == missing_rows[tt] + rm->rm_firstdatacol) {
 1135                         tt++;
 1136                         continue;
 1137                 }
 1138 
 1139                 ASSERT3S(i, <, n);
 1140                 used[i] = c;
 1141                 i++;
 1142         }
 1143 
 1144         /*
 1145          * Initialize the interesting rows of the matrix.
 1146          */
 1147         vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
 1148 
 1149         /*
 1150          * Invert the matrix.
 1151          */
 1152         vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
 1153             invrows, used);
 1154 
 1155         /*
 1156          * Reconstruct the missing data using the generated matrix.
 1157          */
 1158         vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
 1159             invrows, used);
 1160 
 1161         free(p);
 1162 
 1163         return (code);
 1164 }
 1165 
 1166 static int
 1167 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
 1168 {
 1169         int tgts[VDEV_RAIDZ_MAXPARITY];
 1170         int ntgts;
 1171         int i, c;
 1172         int code;
 1173         int nbadparity, nbaddata;
 1174 
 1175         /*
 1176          * The tgts list must already be sorted.
 1177          */
 1178         for (i = 1; i < nt; i++) {
 1179                 ASSERT(t[i] > t[i - 1]);
 1180         }
 1181 
 1182         nbadparity = rm->rm_firstdatacol;
 1183         nbaddata = rm->rm_cols - nbadparity;
 1184         ntgts = 0;
 1185         for (i = 0, c = 0; c < rm->rm_cols; c++) {
 1186                 if (i < nt && c == t[i]) {
 1187                         tgts[ntgts++] = c;
 1188                         i++;
 1189                 } else if (rm->rm_col[c].rc_error != 0) {
 1190                         tgts[ntgts++] = c;
 1191                 } else if (c >= rm->rm_firstdatacol) {
 1192                         nbaddata--;
 1193                 } else {
 1194                         nbadparity--;
 1195                 }
 1196         }
 1197 
 1198         ASSERT(ntgts >= nt);
 1199         ASSERT(nbaddata >= 0);
 1200         ASSERT(nbaddata + nbadparity == ntgts);
 1201 
 1202         code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
 1203         ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
 1204         ASSERT(code > 0);
 1205         return (code);
 1206 }
 1207 
 1208 static raidz_map_t *
 1209 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
 1210     uint64_t dcols, uint64_t nparity)
 1211 {
 1212         raidz_map_t *rm;
 1213         uint64_t b = offset >> unit_shift;
 1214         uint64_t s = size >> unit_shift;
 1215         uint64_t f = b % dcols;
 1216         uint64_t o = (b / dcols) << unit_shift;
 1217         uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 1218 
 1219         q = s / (dcols - nparity);
 1220         r = s - q * (dcols - nparity);
 1221         bc = (r == 0 ? 0 : r + nparity);
 1222         tot = s + nparity * (q + (r == 0 ? 0 : 1));
 1223 
 1224         if (q == 0) {
 1225                 acols = bc;
 1226                 scols = MIN(dcols, roundup(bc, nparity + 1));
 1227         } else {
 1228                 acols = dcols;
 1229                 scols = dcols;
 1230         }
 1231 
 1232         ASSERT3U(acols, <=, scols);
 1233 
 1234         rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
 1235         if (rm == NULL)
 1236                 return (rm);
 1237 
 1238         rm->rm_cols = acols;
 1239         rm->rm_scols = scols;
 1240         rm->rm_bigcols = bc;
 1241         rm->rm_skipstart = bc;
 1242         rm->rm_missingdata = 0;
 1243         rm->rm_missingparity = 0;
 1244         rm->rm_firstdatacol = nparity;
 1245         rm->rm_reports = 0;
 1246         rm->rm_freed = 0;
 1247         rm->rm_ecksuminjected = 0;
 1248 
 1249         asize = 0;
 1250 
 1251         for (c = 0; c < scols; c++) {
 1252                 col = f + c;
 1253                 coff = o;
 1254                 if (col >= dcols) {
 1255                         col -= dcols;
 1256                         coff += 1ULL << unit_shift;
 1257                 }
 1258                 rm->rm_col[c].rc_devidx = col;
 1259                 rm->rm_col[c].rc_offset = coff;
 1260                 rm->rm_col[c].rc_data = NULL;
 1261                 rm->rm_col[c].rc_error = 0;
 1262                 rm->rm_col[c].rc_tried = 0;
 1263                 rm->rm_col[c].rc_skipped = 0;
 1264 
 1265                 if (c >= acols)
 1266                         rm->rm_col[c].rc_size = 0;
 1267                 else if (c < bc)
 1268                         rm->rm_col[c].rc_size = (q + 1) << unit_shift;
 1269                 else
 1270                         rm->rm_col[c].rc_size = q << unit_shift;
 1271 
 1272                 asize += rm->rm_col[c].rc_size;
 1273         }
 1274 
 1275         ASSERT3U(asize, ==, tot << unit_shift);
 1276         rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
 1277         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 1278         ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
 1279         ASSERT3U(rm->rm_nskip, <=, nparity);
 1280 
 1281         for (c = 0; c < rm->rm_firstdatacol; c++) {
 1282                 rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
 1283                 if (rm->rm_col[c].rc_data == NULL) {
 1284                         c++;
 1285                         while (c != 0)
 1286                                 free(rm->rm_col[--c].rc_data);
 1287                         free(rm);
 1288                         return (NULL);
 1289                 }
 1290         }
 1291 
 1292         rm->rm_col[c].rc_data = data;
 1293 
 1294         for (c = c + 1; c < acols; c++)
 1295                 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
 1296                     rm->rm_col[c - 1].rc_size;
 1297 
 1298         /*
 1299          * If all data stored spans all columns, there's a danger that parity
 1300          * will always be on the same device and, since parity isn't read
 1301          * during normal operation, that that device's I/O bandwidth won't be
 1302          * used effectively. We therefore switch the parity every 1MB.
 1303          *
 1304          * ... at least that was, ostensibly, the theory. As a practical
 1305          * matter unless we juggle the parity between all devices evenly, we
 1306          * won't see any benefit. Further, occasional writes that aren't a
 1307          * multiple of the LCM of the number of children and the minimum
 1308          * stripe width are sufficient to avoid pessimal behavior.
 1309          * Unfortunately, this decision created an implicit on-disk format
 1310          * requirement that we need to support for all eternity, but only
 1311          * for single-parity RAID-Z.
 1312          *
 1313          * If we intend to skip a sector in the zeroth column for padding
 1314          * we must make sure to note this swap. We will never intend to
 1315          * skip the first column since at least one data and one parity
 1316          * column must appear in each row.
 1317          */
 1318         ASSERT(rm->rm_cols >= 2);
 1319         ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
 1320 
 1321         if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
 1322                 devidx = rm->rm_col[0].rc_devidx;
 1323                 o = rm->rm_col[0].rc_offset;
 1324                 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
 1325                 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
 1326                 rm->rm_col[1].rc_devidx = devidx;
 1327                 rm->rm_col[1].rc_offset = o;
 1328 
 1329                 if (rm->rm_skipstart == 0)
 1330                         rm->rm_skipstart = 1;
 1331         }
 1332 
 1333         return (rm);
 1334 }
 1335 
 1336 static void
 1337 vdev_raidz_map_free(raidz_map_t *rm)
 1338 {
 1339         int c;
 1340 
 1341         for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
 1342                 free(rm->rm_col[c].rc_data);
 1343 
 1344         free(rm);
 1345 }
 1346 
 1347 static vdev_t *
 1348 vdev_child(vdev_t *pvd, uint64_t devidx)
 1349 {
 1350         vdev_t *cvd;
 1351 
 1352         STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
 1353                 if (cvd->v_id == devidx)
 1354                         break;
 1355         }
 1356 
 1357         return (cvd);
 1358 }
 1359 
 1360 /*
 1361  * We keep track of whether or not there were any injected errors, so that
 1362  * any ereports we generate can note it.
 1363  */
 1364 static int
 1365 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
 1366     uint64_t size)
 1367 {
 1368         return (zio_checksum_verify(spa, bp, data));
 1369 }
 1370 
 1371 /*
 1372  * Generate the parity from the data columns. If we tried and were able to
 1373  * read the parity without error, verify that the generated parity matches the
 1374  * data we read. If it doesn't, we fire off a checksum error. Return the
 1375  * number such failures.
 1376  */
 1377 static int
 1378 raidz_parity_verify(raidz_map_t *rm)
 1379 {
 1380         void *orig[VDEV_RAIDZ_MAXPARITY];
 1381         int c, ret = 0;
 1382         raidz_col_t *rc;
 1383 
 1384         for (c = 0; c < rm->rm_firstdatacol; c++) {
 1385                 rc = &rm->rm_col[c];
 1386                 if (!rc->rc_tried || rc->rc_error != 0)
 1387                         continue;
 1388                 orig[c] = malloc(rc->rc_size);
 1389                 if (orig[c] != NULL) {
 1390                         bcopy(rc->rc_data, orig[c], rc->rc_size);
 1391                 } else {
 1392                         printf("Out of memory\n");
 1393                 }
 1394         }
 1395 
 1396         vdev_raidz_generate_parity(rm);
 1397 
 1398         for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
 1399                 rc = &rm->rm_col[c];
 1400                 if (!rc->rc_tried || rc->rc_error != 0)
 1401                         continue;
 1402                 if (orig[c] == NULL ||
 1403                     bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
 1404                         rc->rc_error = ECKSUM;
 1405                         ret++;
 1406                 }
 1407                 free(orig[c]);
 1408         }
 1409 
 1410         return (ret);
 1411 }
 1412 
 1413 /*
 1414  * Iterate over all combinations of bad data and attempt a reconstruction.
 1415  * Note that the algorithm below is non-optimal because it doesn't take into
 1416  * account how reconstruction is actually performed. For example, with
 1417  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 1418  * is targeted as invalid as if columns 1 and 4 are targeted since in both
 1419  * cases we'd only use parity information in column 0.
 1420  */
 1421 static int
 1422 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
 1423     void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
 1424 {
 1425         raidz_col_t *rc;
 1426         void *orig[VDEV_RAIDZ_MAXPARITY];
 1427         int tstore[VDEV_RAIDZ_MAXPARITY + 2];
 1428         int *tgts = &tstore[1];
 1429         int current, next, i, c, n;
 1430         int code, ret = 0;
 1431 
 1432         ASSERT(total_errors < rm->rm_firstdatacol);
 1433 
 1434         /*
 1435          * This simplifies one edge condition.
 1436          */
 1437         tgts[-1] = -1;
 1438 
 1439         for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
 1440                 /*
 1441                  * Initialize the targets array by finding the first n columns
 1442                  * that contain no error.
 1443                  *
 1444                  * If there were no data errors, we need to ensure that we're
 1445                  * always explicitly attempting to reconstruct at least one
 1446                  * data column. To do this, we simply push the highest target
 1447                  * up into the data columns.
 1448                  */
 1449                 for (c = 0, i = 0; i < n; i++) {
 1450                         if (i == n - 1 && data_errors == 0 &&
 1451                             c < rm->rm_firstdatacol) {
 1452                                 c = rm->rm_firstdatacol;
 1453                         }
 1454 
 1455                         while (rm->rm_col[c].rc_error != 0) {
 1456                                 c++;
 1457                                 ASSERT3S(c, <, rm->rm_cols);
 1458                         }
 1459 
 1460                         tgts[i] = c++;
 1461                 }
 1462 
 1463                 /*
 1464                  * Setting tgts[n] simplifies the other edge condition.
 1465                  */
 1466                 tgts[n] = rm->rm_cols;
 1467 
 1468                 /*
 1469                  * These buffers were allocated in previous iterations.
 1470                  */
 1471                 for (i = 0; i < n - 1; i++) {
 1472                         ASSERT(orig[i] != NULL);
 1473                 }
 1474 
 1475                 orig[n - 1] = malloc(rm->rm_col[0].rc_size);
 1476                 if (orig[n - 1] == NULL) {
 1477                         ret = ENOMEM;
 1478                         goto done;
 1479                 }
 1480 
 1481                 current = 0;
 1482                 next = tgts[current];
 1483 
 1484                 while (current != n) {
 1485                         tgts[current] = next;
 1486                         current = 0;
 1487 
 1488                         /*
 1489                          * Save off the original data that we're going to
 1490                          * attempt to reconstruct.
 1491                          */
 1492                         for (i = 0; i < n; i++) {
 1493                                 ASSERT(orig[i] != NULL);
 1494                                 c = tgts[i];
 1495                                 ASSERT3S(c, >=, 0);
 1496                                 ASSERT3S(c, <, rm->rm_cols);
 1497                                 rc = &rm->rm_col[c];
 1498                                 bcopy(rc->rc_data, orig[i], rc->rc_size);
 1499                         }
 1500 
 1501                         /*
 1502                          * Attempt a reconstruction and exit the outer loop on
 1503                          * success.
 1504                          */
 1505                         code = vdev_raidz_reconstruct(rm, tgts, n);
 1506                         if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
 1507                                 for (i = 0; i < n; i++) {
 1508                                         c = tgts[i];
 1509                                         rc = &rm->rm_col[c];
 1510                                         ASSERT(rc->rc_error == 0);
 1511                                         rc->rc_error = ECKSUM;
 1512                                 }
 1513 
 1514                                 ret = code;
 1515                                 goto done;
 1516                         }
 1517 
 1518                         /*
 1519                          * Restore the original data.
 1520                          */
 1521                         for (i = 0; i < n; i++) {
 1522                                 c = tgts[i];
 1523                                 rc = &rm->rm_col[c];
 1524                                 bcopy(orig[i], rc->rc_data, rc->rc_size);
 1525                         }
 1526 
 1527                         do {
 1528                                 /*
 1529                                  * Find the next valid column after the current
 1530                                  * position..
 1531                                  */
 1532                                 for (next = tgts[current] + 1;
 1533                                     next < rm->rm_cols &&
 1534                                     rm->rm_col[next].rc_error != 0; next++)
 1535                                         continue;
 1536 
 1537                                 ASSERT(next <= tgts[current + 1]);
 1538 
 1539                                 /*
 1540                                  * If that spot is available, we're done here.
 1541                                  */
 1542                                 if (next != tgts[current + 1])
 1543                                         break;
 1544 
 1545                                 /*
 1546                                  * Otherwise, find the next valid column after
 1547                                  * the previous position.
 1548                                  */
 1549                                 for (c = tgts[current - 1] + 1;
 1550                                     rm->rm_col[c].rc_error != 0; c++)
 1551                                         continue;
 1552 
 1553                                 tgts[current] = c;
 1554                                 current++;
 1555 
 1556                         } while (current != n);
 1557                 }
 1558         }
 1559         n--;
 1560 done:
 1561         for (i = n - 1; i >= 0; i--) {
 1562                 free(orig[i]);
 1563         }
 1564 
 1565         return (ret);
 1566 }
 1567 
 1568 static int
 1569 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
 1570     off_t offset, size_t bytes)
 1571 {
 1572         vdev_t *tvd = vd->v_top;
 1573         vdev_t *cvd;
 1574         raidz_map_t *rm;
 1575         raidz_col_t *rc;
 1576         int c, error;
 1577         int unexpected_errors;
 1578         int parity_errors;
 1579         int parity_untried;
 1580         int data_errors;
 1581         int total_errors;
 1582         int n;
 1583         int tgts[VDEV_RAIDZ_MAXPARITY];
 1584         int code;
 1585 
 1586         rc = NULL;      /* gcc */
 1587         error = 0;
 1588 
 1589         rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
 1590             vd->v_nchildren, vd->v_nparity);
 1591         if (rm == NULL)
 1592                 return (ENOMEM);
 1593 
 1594         /*
 1595          * Iterate over the columns in reverse order so that we hit the parity
 1596          * last -- any errors along the way will force us to read the parity.
 1597          */
 1598         for (c = rm->rm_cols - 1; c >= 0; c--) {
 1599                 rc = &rm->rm_col[c];
 1600                 cvd = vdev_child(vd, rc->rc_devidx);
 1601                 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
 1602                         if (c >= rm->rm_firstdatacol)
 1603                                 rm->rm_missingdata++;
 1604                         else
 1605                                 rm->rm_missingparity++;
 1606                         rc->rc_error = ENXIO;
 1607                         rc->rc_tried = 1;       /* don't even try */
 1608                         rc->rc_skipped = 1;
 1609                         continue;
 1610                 }
 1611 #if 0           /* XXX: Too hard for the boot code. */
 1612                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
 1613                         if (c >= rm->rm_firstdatacol)
 1614                                 rm->rm_missingdata++;
 1615                         else
 1616                                 rm->rm_missingparity++;
 1617                         rc->rc_error = ESTALE;
 1618                         rc->rc_skipped = 1;
 1619                         continue;
 1620                 }
 1621 #endif
 1622                 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
 1623                         rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
 1624                             rc->rc_offset, rc->rc_size);
 1625                         rc->rc_tried = 1;
 1626                         rc->rc_skipped = 0;
 1627                 }
 1628         }
 1629 
 1630 reconstruct:
 1631         unexpected_errors = 0;
 1632         parity_errors = 0;
 1633         parity_untried = 0;
 1634         data_errors = 0;
 1635         total_errors = 0;
 1636 
 1637         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
 1638         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
 1639 
 1640         for (c = 0; c < rm->rm_cols; c++) {
 1641                 rc = &rm->rm_col[c];
 1642 
 1643                 if (rc->rc_error) {
 1644                         ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
 1645 
 1646                         if (c < rm->rm_firstdatacol)
 1647                                 parity_errors++;
 1648                         else
 1649                                 data_errors++;
 1650 
 1651                         if (!rc->rc_skipped)
 1652                                 unexpected_errors++;
 1653 
 1654                         total_errors++;
 1655                 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
 1656                         parity_untried++;
 1657                 }
 1658         }
 1659 
 1660         /*
 1661          * There are three potential phases for a read:
 1662          *      1. produce valid data from the columns read
 1663          *      2. read all disks and try again
 1664          *      3. perform combinatorial reconstruction
 1665          *
 1666          * Each phase is progressively both more expensive and less likely to
 1667          * occur. If we encounter more errors than we can repair or all phases
 1668          * fail, we have no choice but to return an error.
 1669          */
 1670 
 1671         /*
 1672          * If the number of errors we saw was correctable -- less than or equal
 1673          * to the number of parity disks read -- attempt to produce data that
 1674          * has a valid checksum. Naturally, this case applies in the absence of
 1675          * any errors.
 1676          */
 1677         if (total_errors <= rm->rm_firstdatacol - parity_untried) {
 1678                 int rv;
 1679 
 1680                 if (data_errors == 0) {
 1681                         rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
 1682                         if (rv == 0) {
 1683                                 /*
 1684                                  * If we read parity information (unnecessarily
 1685                                  * as it happens since no reconstruction was
 1686                                  * needed) regenerate and verify the parity.
 1687                                  * We also regenerate parity when resilvering
 1688                                  * so we can write it out to the failed device
 1689                                  * later.
 1690                                  */
 1691                                 if (parity_errors + parity_untried <
 1692                                     rm->rm_firstdatacol) {
 1693                                         n = raidz_parity_verify(rm);
 1694                                         unexpected_errors += n;
 1695                                         ASSERT(parity_errors + n <=
 1696                                             rm->rm_firstdatacol);
 1697                                 }
 1698                                 goto done;
 1699                         }
 1700                 } else {
 1701                         /*
 1702                          * We either attempt to read all the parity columns or
 1703                          * none of them. If we didn't try to read parity, we
 1704                          * wouldn't be here in the correctable case. There must
 1705                          * also have been fewer parity errors than parity
 1706                          * columns or, again, we wouldn't be in this code path.
 1707                          */
 1708                         ASSERT(parity_untried == 0);
 1709                         ASSERT(parity_errors < rm->rm_firstdatacol);
 1710 
 1711                         /*
 1712                          * Identify the data columns that reported an error.
 1713                          */
 1714                         n = 0;
 1715                         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 1716                                 rc = &rm->rm_col[c];
 1717                                 if (rc->rc_error != 0) {
 1718                                         ASSERT(n < VDEV_RAIDZ_MAXPARITY);
 1719                                         tgts[n++] = c;
 1720                                 }
 1721                         }
 1722 
 1723                         ASSERT(rm->rm_firstdatacol >= n);
 1724 
 1725                         code = vdev_raidz_reconstruct(rm, tgts, n);
 1726 
 1727                         rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
 1728                         if (rv == 0) {
 1729                                 /*
 1730                                  * If we read more parity disks than were used
 1731                                  * for reconstruction, confirm that the other
 1732                                  * parity disks produced correct data. This
 1733                                  * routine is suboptimal in that it regenerates
 1734                                  * the parity that we already used in addition
 1735                                  * to the parity that we're attempting to
 1736                                  * verify, but this should be a relatively
 1737                                  * uncommon case, and can be optimized if it
 1738                                  * becomes a problem. Note that we regenerate
 1739                                  * parity when resilvering so we can write it
 1740                                  * out to failed devices later.
 1741                                  */
 1742                                 if (parity_errors < rm->rm_firstdatacol - n) {
 1743                                         n = raidz_parity_verify(rm);
 1744                                         unexpected_errors += n;
 1745                                         ASSERT(parity_errors + n <=
 1746                                             rm->rm_firstdatacol);
 1747                                 }
 1748 
 1749                                 goto done;
 1750                         }
 1751                 }
 1752         }
 1753 
 1754         /*
 1755          * This isn't a typical situation -- either we got a read
 1756          * error or a child silently returned bad data. Read every
 1757          * block so we can try again with as much data and parity as
 1758          * we can track down. If we've already been through once
 1759          * before, all children will be marked as tried so we'll
 1760          * proceed to combinatorial reconstruction.
 1761          */
 1762         unexpected_errors = 1;
 1763         rm->rm_missingdata = 0;
 1764         rm->rm_missingparity = 0;
 1765 
 1766         n = 0;
 1767         for (c = 0; c < rm->rm_cols; c++) {
 1768                 rc = &rm->rm_col[c];
 1769 
 1770                 if (rc->rc_tried)
 1771                         continue;
 1772 
 1773                 cvd = vdev_child(vd, rc->rc_devidx);
 1774                 ASSERT(cvd != NULL);
 1775                 rc->rc_error = cvd->v_read(cvd, NULL,
 1776                     rc->rc_data, rc->rc_offset, rc->rc_size);
 1777                 if (rc->rc_error == 0)
 1778                         n++;
 1779                 rc->rc_tried = 1;
 1780                 rc->rc_skipped = 0;
 1781         }
 1782         /*
 1783          * If we managed to read anything more, retry the
 1784          * reconstruction.
 1785          */
 1786         if (n > 0)
 1787                 goto reconstruct;
 1788 
 1789         /*
 1790          * At this point we've attempted to reconstruct the data given the
 1791          * errors we detected, and we've attempted to read all columns. There
 1792          * must, therefore, be one or more additional problems -- silent errors
 1793          * resulting in invalid data rather than explicit I/O errors resulting
 1794          * in absent data. We check if there is enough additional data to
 1795          * possibly reconstruct the data and then perform combinatorial
 1796          * reconstruction over all possible combinations. If that fails,
 1797          * we're cooked.
 1798          */
 1799         if (total_errors > rm->rm_firstdatacol) {
 1800                 error = EIO;
 1801         } else if (total_errors < rm->rm_firstdatacol &&
 1802             (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
 1803              total_errors, data_errors)) != 0) {
 1804                 /*
 1805                  * If we didn't use all the available parity for the
 1806                  * combinatorial reconstruction, verify that the remaining
 1807                  * parity is correct.
 1808                  */
 1809                 if (code != (1 << rm->rm_firstdatacol) - 1)
 1810                         (void) raidz_parity_verify(rm);
 1811         } else {
 1812                 /*
 1813                  * We're here because either:
 1814                  *
 1815                  *      total_errors == rm_first_datacol, or
 1816                  *      vdev_raidz_combrec() failed
 1817                  *
 1818                  * In either case, there is enough bad data to prevent
 1819                  * reconstruction.
 1820                  *
 1821                  * Start checksum ereports for all children which haven't
 1822                  * failed, and the IO wasn't speculative.
 1823                  */
 1824                 error = ECKSUM;
 1825         }
 1826 
 1827 done:
 1828         vdev_raidz_map_free(rm);
 1829 
 1830         return (error);
 1831 }

Cache object: cf94a3a8014e0144e9ba5638db32fe96


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.