The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/lib/decompress_bunzip2.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      Small bzip2 deflate implementation, by Rob Landley (rob@landley.net).
    2 
    3         Based on bzip2 decompression code by Julian R Seward (jseward@acm.org),
    4         which also acknowledges contributions by Mike Burrows, David Wheeler,
    5         Peter Fenwick, Alistair Moffat, Radford Neal, Ian H. Witten,
    6         Robert Sedgewick, and Jon L. Bentley.
    7 
    8         This code is licensed under the LGPLv2:
    9                 LGPL (http://www.gnu.org/copyleft/lgpl.html
   10 */
   11 
   12 /*
   13         Size and speed optimizations by Manuel Novoa III  (mjn3@codepoet.org).
   14 
   15         More efficient reading of Huffman codes, a streamlined read_bunzip()
   16         function, and various other tweaks.  In (limited) tests, approximately
   17         20% faster than bzcat on x86 and about 10% faster on arm.
   18 
   19         Note that about 2/3 of the time is spent in read_unzip() reversing
   20         the Burrows-Wheeler transformation.  Much of that time is delay
   21         resulting from cache misses.
   22 
   23         I would ask that anyone benefiting from this work, especially those
   24         using it in commercial products, consider making a donation to my local
   25         non-profit hospice organization in the name of the woman I loved, who
   26         passed away Feb. 12, 2003.
   27 
   28                 In memory of Toni W. Hagan
   29 
   30                 Hospice of Acadiana, Inc.
   31                 2600 Johnston St., Suite 200
   32                 Lafayette, LA 70503-3240
   33 
   34                 Phone (337) 232-1234 or 1-800-738-2226
   35                 Fax   (337) 232-1297
   36 
   37                 http://www.hospiceacadiana.com/
   38 
   39         Manuel
   40  */
   41 
   42 /*
   43         Made it fit for running in Linux Kernel by Alain Knaff (alain@knaff.lu)
   44 */
   45 
   46 
   47 #ifdef STATIC
   48 #define PREBOOT
   49 #else
   50 #include <linux/decompress/bunzip2.h>
   51 #endif /* STATIC */
   52 
   53 #include <linux/decompress/mm.h>
   54 
   55 #ifndef INT_MAX
   56 #define INT_MAX 0x7fffffff
   57 #endif
   58 
   59 /* Constants for Huffman coding */
   60 #define MAX_GROUPS              6
   61 #define GROUP_SIZE              50      /* 64 would have been more efficient */
   62 #define MAX_HUFCODE_BITS        20      /* Longest Huffman code allowed */
   63 #define MAX_SYMBOLS             258     /* 256 literals + RUNA + RUNB */
   64 #define SYMBOL_RUNA             0
   65 #define SYMBOL_RUNB             1
   66 
   67 /* Status return values */
   68 #define RETVAL_OK                       0
   69 #define RETVAL_LAST_BLOCK               (-1)
   70 #define RETVAL_NOT_BZIP_DATA            (-2)
   71 #define RETVAL_UNEXPECTED_INPUT_EOF     (-3)
   72 #define RETVAL_UNEXPECTED_OUTPUT_EOF    (-4)
   73 #define RETVAL_DATA_ERROR               (-5)
   74 #define RETVAL_OUT_OF_MEMORY            (-6)
   75 #define RETVAL_OBSOLETE_INPUT           (-7)
   76 
   77 /* Other housekeeping constants */
   78 #define BZIP2_IOBUF_SIZE                4096
   79 
   80 /* This is what we know about each Huffman coding group */
   81 struct group_data {
   82         /* We have an extra slot at the end of limit[] for a sentinal value. */
   83         int limit[MAX_HUFCODE_BITS+1];
   84         int base[MAX_HUFCODE_BITS];
   85         int permute[MAX_SYMBOLS];
   86         int minLen, maxLen;
   87 };
   88 
   89 /* Structure holding all the housekeeping data, including IO buffers and
   90    memory that persists between calls to bunzip */
   91 struct bunzip_data {
   92         /* State for interrupting output loop */
   93         int writeCopies, writePos, writeRunCountdown, writeCount, writeCurrent;
   94         /* I/O tracking data (file handles, buffers, positions, etc.) */
   95         int (*fill)(void*, unsigned int);
   96         int inbufCount, inbufPos /*, outbufPos*/;
   97         unsigned char *inbuf /*,*outbuf*/;
   98         unsigned int inbufBitCount, inbufBits;
   99         /* The CRC values stored in the block header and calculated from the
  100         data */
  101         unsigned int crc32Table[256], headerCRC, totalCRC, writeCRC;
  102         /* Intermediate buffer and its size (in bytes) */
  103         unsigned int *dbuf, dbufSize;
  104         /* These things are a bit too big to go on the stack */
  105         unsigned char selectors[32768];         /* nSelectors = 15 bits */
  106         struct group_data groups[MAX_GROUPS];   /* Huffman coding tables */
  107         int io_error;                   /* non-zero if we have IO error */
  108         int byteCount[256];
  109         unsigned char symToByte[256], mtfSymbol[256];
  110 };
  111 
  112 
  113 /* Return the next nnn bits of input.  All reads from the compressed input
  114    are done through this function.  All reads are big endian */
  115 static unsigned int INIT get_bits(struct bunzip_data *bd, char bits_wanted)
  116 {
  117         unsigned int bits = 0;
  118 
  119         /* If we need to get more data from the byte buffer, do so.
  120            (Loop getting one byte at a time to enforce endianness and avoid
  121            unaligned access.) */
  122         while (bd->inbufBitCount < bits_wanted) {
  123                 /* If we need to read more data from file into byte buffer, do
  124                    so */
  125                 if (bd->inbufPos == bd->inbufCount) {
  126                         if (bd->io_error)
  127                                 return 0;
  128                         bd->inbufCount = bd->fill(bd->inbuf, BZIP2_IOBUF_SIZE);
  129                         if (bd->inbufCount <= 0) {
  130                                 bd->io_error = RETVAL_UNEXPECTED_INPUT_EOF;
  131                                 return 0;
  132                         }
  133                         bd->inbufPos = 0;
  134                 }
  135                 /* Avoid 32-bit overflow (dump bit buffer to top of output) */
  136                 if (bd->inbufBitCount >= 24) {
  137                         bits = bd->inbufBits&((1 << bd->inbufBitCount)-1);
  138                         bits_wanted -= bd->inbufBitCount;
  139                         bits <<= bits_wanted;
  140                         bd->inbufBitCount = 0;
  141                 }
  142                 /* Grab next 8 bits of input from buffer. */
  143                 bd->inbufBits = (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++];
  144                 bd->inbufBitCount += 8;
  145         }
  146         /* Calculate result */
  147         bd->inbufBitCount -= bits_wanted;
  148         bits |= (bd->inbufBits >> bd->inbufBitCount)&((1 << bits_wanted)-1);
  149 
  150         return bits;
  151 }
  152 
  153 /* Unpacks the next block and sets up for the inverse burrows-wheeler step. */
  154 
  155 static int INIT get_next_block(struct bunzip_data *bd)
  156 {
  157         struct group_data *hufGroup = NULL;
  158         int *base = NULL;
  159         int *limit = NULL;
  160         int dbufCount, nextSym, dbufSize, groupCount, selector,
  161                 i, j, k, t, runPos, symCount, symTotal, nSelectors, *byteCount;
  162         unsigned char uc, *symToByte, *mtfSymbol, *selectors;
  163         unsigned int *dbuf, origPtr;
  164 
  165         dbuf = bd->dbuf;
  166         dbufSize = bd->dbufSize;
  167         selectors = bd->selectors;
  168         byteCount = bd->byteCount;
  169         symToByte = bd->symToByte;
  170         mtfSymbol = bd->mtfSymbol;
  171 
  172         /* Read in header signature and CRC, then validate signature.
  173            (last block signature means CRC is for whole file, return now) */
  174         i = get_bits(bd, 24);
  175         j = get_bits(bd, 24);
  176         bd->headerCRC = get_bits(bd, 32);
  177         if ((i == 0x177245) && (j == 0x385090))
  178                 return RETVAL_LAST_BLOCK;
  179         if ((i != 0x314159) || (j != 0x265359))
  180                 return RETVAL_NOT_BZIP_DATA;
  181         /* We can add support for blockRandomised if anybody complains.
  182            There was some code for this in busybox 1.0.0-pre3, but nobody ever
  183            noticed that it didn't actually work. */
  184         if (get_bits(bd, 1))
  185                 return RETVAL_OBSOLETE_INPUT;
  186         origPtr = get_bits(bd, 24);
  187         if (origPtr > dbufSize)
  188                 return RETVAL_DATA_ERROR;
  189         /* mapping table: if some byte values are never used (encoding things
  190            like ascii text), the compression code removes the gaps to have fewer
  191            symbols to deal with, and writes a sparse bitfield indicating which
  192            values were present.  We make a translation table to convert the
  193            symbols back to the corresponding bytes. */
  194         t = get_bits(bd, 16);
  195         symTotal = 0;
  196         for (i = 0; i < 16; i++) {
  197                 if (t&(1 << (15-i))) {
  198                         k = get_bits(bd, 16);
  199                         for (j = 0; j < 16; j++)
  200                                 if (k&(1 << (15-j)))
  201                                         symToByte[symTotal++] = (16*i)+j;
  202                 }
  203         }
  204         /* How many different Huffman coding groups does this block use? */
  205         groupCount = get_bits(bd, 3);
  206         if (groupCount < 2 || groupCount > MAX_GROUPS)
  207                 return RETVAL_DATA_ERROR;
  208         /* nSelectors: Every GROUP_SIZE many symbols we select a new
  209            Huffman coding group.  Read in the group selector list,
  210            which is stored as MTF encoded bit runs.  (MTF = Move To
  211            Front, as each value is used it's moved to the start of the
  212            list.) */
  213         nSelectors = get_bits(bd, 15);
  214         if (!nSelectors)
  215                 return RETVAL_DATA_ERROR;
  216         for (i = 0; i < groupCount; i++)
  217                 mtfSymbol[i] = i;
  218         for (i = 0; i < nSelectors; i++) {
  219                 /* Get next value */
  220                 for (j = 0; get_bits(bd, 1); j++)
  221                         if (j >= groupCount)
  222                                 return RETVAL_DATA_ERROR;
  223                 /* Decode MTF to get the next selector */
  224                 uc = mtfSymbol[j];
  225                 for (; j; j--)
  226                         mtfSymbol[j] = mtfSymbol[j-1];
  227                 mtfSymbol[0] = selectors[i] = uc;
  228         }
  229         /* Read the Huffman coding tables for each group, which code
  230            for symTotal literal symbols, plus two run symbols (RUNA,
  231            RUNB) */
  232         symCount = symTotal+2;
  233         for (j = 0; j < groupCount; j++) {
  234                 unsigned char length[MAX_SYMBOLS], temp[MAX_HUFCODE_BITS+1];
  235                 int     minLen, maxLen, pp;
  236                 /* Read Huffman code lengths for each symbol.  They're
  237                    stored in a way similar to mtf; record a starting
  238                    value for the first symbol, and an offset from the
  239                    previous value for everys symbol after that.
  240                    (Subtracting 1 before the loop and then adding it
  241                    back at the end is an optimization that makes the
  242                    test inside the loop simpler: symbol length 0
  243                    becomes negative, so an unsigned inequality catches
  244                    it.) */
  245                 t = get_bits(bd, 5)-1;
  246                 for (i = 0; i < symCount; i++) {
  247                         for (;;) {
  248                                 if (((unsigned)t) > (MAX_HUFCODE_BITS-1))
  249                                         return RETVAL_DATA_ERROR;
  250 
  251                                 /* If first bit is 0, stop.  Else
  252                                    second bit indicates whether to
  253                                    increment or decrement the value.
  254                                    Optimization: grab 2 bits and unget
  255                                    the second if the first was 0. */
  256 
  257                                 k = get_bits(bd, 2);
  258                                 if (k < 2) {
  259                                         bd->inbufBitCount++;
  260                                         break;
  261                                 }
  262                                 /* Add one if second bit 1, else
  263                                  * subtract 1.  Avoids if/else */
  264                                 t += (((k+1)&2)-1);
  265                         }
  266                         /* Correct for the initial -1, to get the
  267                          * final symbol length */
  268                         length[i] = t+1;
  269                 }
  270                 /* Find largest and smallest lengths in this group */
  271                 minLen = maxLen = length[0];
  272 
  273                 for (i = 1; i < symCount; i++) {
  274                         if (length[i] > maxLen)
  275                                 maxLen = length[i];
  276                         else if (length[i] < minLen)
  277                                 minLen = length[i];
  278                 }
  279 
  280                 /* Calculate permute[], base[], and limit[] tables from
  281                  * length[].
  282                  *
  283                  * permute[] is the lookup table for converting
  284                  * Huffman coded symbols into decoded symbols.  base[]
  285                  * is the amount to subtract from the value of a
  286                  * Huffman symbol of a given length when using
  287                  * permute[].
  288                  *
  289                  * limit[] indicates the largest numerical value a
  290                  * symbol with a given number of bits can have.  This
  291                  * is how the Huffman codes can vary in length: each
  292                  * code with a value > limit[length] needs another
  293                  * bit.
  294                  */
  295                 hufGroup = bd->groups+j;
  296                 hufGroup->minLen = minLen;
  297                 hufGroup->maxLen = maxLen;
  298                 /* Note that minLen can't be smaller than 1, so we
  299                    adjust the base and limit array pointers so we're
  300                    not always wasting the first entry.  We do this
  301                    again when using them (during symbol decoding).*/
  302                 base = hufGroup->base-1;
  303                 limit = hufGroup->limit-1;
  304                 /* Calculate permute[].  Concurrently, initialize
  305                  * temp[] and limit[]. */
  306                 pp = 0;
  307                 for (i = minLen; i <= maxLen; i++) {
  308                         temp[i] = limit[i] = 0;
  309                         for (t = 0; t < symCount; t++)
  310                                 if (length[t] == i)
  311                                         hufGroup->permute[pp++] = t;
  312                 }
  313                 /* Count symbols coded for at each bit length */
  314                 for (i = 0; i < symCount; i++)
  315                         temp[length[i]]++;
  316                 /* Calculate limit[] (the largest symbol-coding value
  317                  *at each bit length, which is (previous limit <<
  318                  *1)+symbols at this level), and base[] (number of
  319                  *symbols to ignore at each bit length, which is limit
  320                  *minus the cumulative count of symbols coded for
  321                  *already). */
  322                 pp = t = 0;
  323                 for (i = minLen; i < maxLen; i++) {
  324                         pp += temp[i];
  325                         /* We read the largest possible symbol size
  326                            and then unget bits after determining how
  327                            many we need, and those extra bits could be
  328                            set to anything.  (They're noise from
  329                            future symbols.)  At each level we're
  330                            really only interested in the first few
  331                            bits, so here we set all the trailing
  332                            to-be-ignored bits to 1 so they don't
  333                            affect the value > limit[length]
  334                            comparison. */
  335                         limit[i] = (pp << (maxLen - i)) - 1;
  336                         pp <<= 1;
  337                         base[i+1] = pp-(t += temp[i]);
  338                 }
  339                 limit[maxLen+1] = INT_MAX; /* Sentinal value for
  340                                             * reading next sym. */
  341                 limit[maxLen] = pp+temp[maxLen]-1;
  342                 base[minLen] = 0;
  343         }
  344         /* We've finished reading and digesting the block header.  Now
  345            read this block's Huffman coded symbols from the file and
  346            undo the Huffman coding and run length encoding, saving the
  347            result into dbuf[dbufCount++] = uc */
  348 
  349         /* Initialize symbol occurrence counters and symbol Move To
  350          * Front table */
  351         for (i = 0; i < 256; i++) {
  352                 byteCount[i] = 0;
  353                 mtfSymbol[i] = (unsigned char)i;
  354         }
  355         /* Loop through compressed symbols. */
  356         runPos = dbufCount = symCount = selector = 0;
  357         for (;;) {
  358                 /* Determine which Huffman coding group to use. */
  359                 if (!(symCount--)) {
  360                         symCount = GROUP_SIZE-1;
  361                         if (selector >= nSelectors)
  362                                 return RETVAL_DATA_ERROR;
  363                         hufGroup = bd->groups+selectors[selector++];
  364                         base = hufGroup->base-1;
  365                         limit = hufGroup->limit-1;
  366                 }
  367                 /* Read next Huffman-coded symbol. */
  368                 /* Note: It is far cheaper to read maxLen bits and
  369                    back up than it is to read minLen bits and then an
  370                    additional bit at a time, testing as we go.
  371                    Because there is a trailing last block (with file
  372                    CRC), there is no danger of the overread causing an
  373                    unexpected EOF for a valid compressed file.  As a
  374                    further optimization, we do the read inline
  375                    (falling back to a call to get_bits if the buffer
  376                    runs dry).  The following (up to got_huff_bits:) is
  377                    equivalent to j = get_bits(bd, hufGroup->maxLen);
  378                  */
  379                 while (bd->inbufBitCount < hufGroup->maxLen) {
  380                         if (bd->inbufPos == bd->inbufCount) {
  381                                 j = get_bits(bd, hufGroup->maxLen);
  382                                 goto got_huff_bits;
  383                         }
  384                         bd->inbufBits =
  385                                 (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++];
  386                         bd->inbufBitCount += 8;
  387                 };
  388                 bd->inbufBitCount -= hufGroup->maxLen;
  389                 j = (bd->inbufBits >> bd->inbufBitCount)&
  390                         ((1 << hufGroup->maxLen)-1);
  391 got_huff_bits:
  392                 /* Figure how how many bits are in next symbol and
  393                  * unget extras */
  394                 i = hufGroup->minLen;
  395                 while (j > limit[i])
  396                         ++i;
  397                 bd->inbufBitCount += (hufGroup->maxLen - i);
  398                 /* Huffman decode value to get nextSym (with bounds checking) */
  399                 if ((i > hufGroup->maxLen)
  400                         || (((unsigned)(j = (j>>(hufGroup->maxLen-i))-base[i]))
  401                                 >= MAX_SYMBOLS))
  402                         return RETVAL_DATA_ERROR;
  403                 nextSym = hufGroup->permute[j];
  404                 /* We have now decoded the symbol, which indicates
  405                    either a new literal byte, or a repeated run of the
  406                    most recent literal byte.  First, check if nextSym
  407                    indicates a repeated run, and if so loop collecting
  408                    how many times to repeat the last literal. */
  409                 if (((unsigned)nextSym) <= SYMBOL_RUNB) { /* RUNA or RUNB */
  410                         /* If this is the start of a new run, zero out
  411                          * counter */
  412                         if (!runPos) {
  413                                 runPos = 1;
  414                                 t = 0;
  415                         }
  416                         /* Neat trick that saves 1 symbol: instead of
  417                            or-ing 0 or 1 at each bit position, add 1
  418                            or 2 instead.  For example, 1011 is 1 << 0
  419                            + 1 << 1 + 2 << 2.  1010 is 2 << 0 + 2 << 1
  420                            + 1 << 2.  You can make any bit pattern
  421                            that way using 1 less symbol than the basic
  422                            or 0/1 method (except all bits 0, which
  423                            would use no symbols, but a run of length 0
  424                            doesn't mean anything in this context).
  425                            Thus space is saved. */
  426                         t += (runPos << nextSym);
  427                         /* +runPos if RUNA; +2*runPos if RUNB */
  428 
  429                         runPos <<= 1;
  430                         continue;
  431                 }
  432                 /* When we hit the first non-run symbol after a run,
  433                    we now know how many times to repeat the last
  434                    literal, so append that many copies to our buffer
  435                    of decoded symbols (dbuf) now.  (The last literal
  436                    used is the one at the head of the mtfSymbol
  437                    array.) */
  438                 if (runPos) {
  439                         runPos = 0;
  440                         if (dbufCount+t >= dbufSize)
  441                                 return RETVAL_DATA_ERROR;
  442 
  443                         uc = symToByte[mtfSymbol[0]];
  444                         byteCount[uc] += t;
  445                         while (t--)
  446                                 dbuf[dbufCount++] = uc;
  447                 }
  448                 /* Is this the terminating symbol? */
  449                 if (nextSym > symTotal)
  450                         break;
  451                 /* At this point, nextSym indicates a new literal
  452                    character.  Subtract one to get the position in the
  453                    MTF array at which this literal is currently to be
  454                    found.  (Note that the result can't be -1 or 0,
  455                    because 0 and 1 are RUNA and RUNB.  But another
  456                    instance of the first symbol in the mtf array,
  457                    position 0, would have been handled as part of a
  458                    run above.  Therefore 1 unused mtf position minus 2
  459                    non-literal nextSym values equals -1.) */
  460                 if (dbufCount >= dbufSize)
  461                         return RETVAL_DATA_ERROR;
  462                 i = nextSym - 1;
  463                 uc = mtfSymbol[i];
  464                 /* Adjust the MTF array.  Since we typically expect to
  465                  *move only a small number of symbols, and are bound
  466                  *by 256 in any case, using memmove here would
  467                  *typically be bigger and slower due to function call
  468                  *overhead and other assorted setup costs. */
  469                 do {
  470                         mtfSymbol[i] = mtfSymbol[i-1];
  471                 } while (--i);
  472                 mtfSymbol[0] = uc;
  473                 uc = symToByte[uc];
  474                 /* We have our literal byte.  Save it into dbuf. */
  475                 byteCount[uc]++;
  476                 dbuf[dbufCount++] = (unsigned int)uc;
  477         }
  478         /* At this point, we've read all the Huffman-coded symbols
  479            (and repeated runs) for this block from the input stream,
  480            and decoded them into the intermediate buffer.  There are
  481            dbufCount many decoded bytes in dbuf[].  Now undo the
  482            Burrows-Wheeler transform on dbuf.  See
  483            http://dogma.net/markn/articles/bwt/bwt.htm
  484          */
  485         /* Turn byteCount into cumulative occurrence counts of 0 to n-1. */
  486         j = 0;
  487         for (i = 0; i < 256; i++) {
  488                 k = j+byteCount[i];
  489                 byteCount[i] = j;
  490                 j = k;
  491         }
  492         /* Figure out what order dbuf would be in if we sorted it. */
  493         for (i = 0; i < dbufCount; i++) {
  494                 uc = (unsigned char)(dbuf[i] & 0xff);
  495                 dbuf[byteCount[uc]] |= (i << 8);
  496                 byteCount[uc]++;
  497         }
  498         /* Decode first byte by hand to initialize "previous" byte.
  499            Note that it doesn't get output, and if the first three
  500            characters are identical it doesn't qualify as a run (hence
  501            writeRunCountdown = 5). */
  502         if (dbufCount) {
  503                 if (origPtr >= dbufCount)
  504                         return RETVAL_DATA_ERROR;
  505                 bd->writePos = dbuf[origPtr];
  506                 bd->writeCurrent = (unsigned char)(bd->writePos&0xff);
  507                 bd->writePos >>= 8;
  508                 bd->writeRunCountdown = 5;
  509         }
  510         bd->writeCount = dbufCount;
  511 
  512         return RETVAL_OK;
  513 }
  514 
  515 /* Undo burrows-wheeler transform on intermediate buffer to produce output.
  516    If start_bunzip was initialized with out_fd =-1, then up to len bytes of
  517    data are written to outbuf.  Return value is number of bytes written or
  518    error (all errors are negative numbers).  If out_fd!=-1, outbuf and len
  519    are ignored, data is written to out_fd and return is RETVAL_OK or error.
  520 */
  521 
  522 static int INIT read_bunzip(struct bunzip_data *bd, char *outbuf, int len)
  523 {
  524         const unsigned int *dbuf;
  525         int pos, xcurrent, previous, gotcount;
  526 
  527         /* If last read was short due to end of file, return last block now */
  528         if (bd->writeCount < 0)
  529                 return bd->writeCount;
  530 
  531         gotcount = 0;
  532         dbuf = bd->dbuf;
  533         pos = bd->writePos;
  534         xcurrent = bd->writeCurrent;
  535 
  536         /* We will always have pending decoded data to write into the output
  537            buffer unless this is the very first call (in which case we haven't
  538            Huffman-decoded a block into the intermediate buffer yet). */
  539 
  540         if (bd->writeCopies) {
  541                 /* Inside the loop, writeCopies means extra copies (beyond 1) */
  542                 --bd->writeCopies;
  543                 /* Loop outputting bytes */
  544                 for (;;) {
  545                         /* If the output buffer is full, snapshot
  546                          * state and return */
  547                         if (gotcount >= len) {
  548                                 bd->writePos = pos;
  549                                 bd->writeCurrent = xcurrent;
  550                                 bd->writeCopies++;
  551                                 return len;
  552                         }
  553                         /* Write next byte into output buffer, updating CRC */
  554                         outbuf[gotcount++] = xcurrent;
  555                         bd->writeCRC = (((bd->writeCRC) << 8)
  556                                 ^bd->crc32Table[((bd->writeCRC) >> 24)
  557                                 ^xcurrent]);
  558                         /* Loop now if we're outputting multiple
  559                          * copies of this byte */
  560                         if (bd->writeCopies) {
  561                                 --bd->writeCopies;
  562                                 continue;
  563                         }
  564 decode_next_byte:
  565                         if (!bd->writeCount--)
  566                                 break;
  567                         /* Follow sequence vector to undo
  568                          * Burrows-Wheeler transform */
  569                         previous = xcurrent;
  570                         pos = dbuf[pos];
  571                         xcurrent = pos&0xff;
  572                         pos >>= 8;
  573                         /* After 3 consecutive copies of the same
  574                            byte, the 4th is a repeat count.  We count
  575                            down from 4 instead *of counting up because
  576                            testing for non-zero is faster */
  577                         if (--bd->writeRunCountdown) {
  578                                 if (xcurrent != previous)
  579                                         bd->writeRunCountdown = 4;
  580                         } else {
  581                                 /* We have a repeated run, this byte
  582                                  * indicates the count */
  583                                 bd->writeCopies = xcurrent;
  584                                 xcurrent = previous;
  585                                 bd->writeRunCountdown = 5;
  586                                 /* Sometimes there are just 3 bytes
  587                                  * (run length 0) */
  588                                 if (!bd->writeCopies)
  589                                         goto decode_next_byte;
  590                                 /* Subtract the 1 copy we'd output
  591                                  * anyway to get extras */
  592                                 --bd->writeCopies;
  593                         }
  594                 }
  595                 /* Decompression of this block completed successfully */
  596                 bd->writeCRC = ~bd->writeCRC;
  597                 bd->totalCRC = ((bd->totalCRC << 1) |
  598                                 (bd->totalCRC >> 31)) ^ bd->writeCRC;
  599                 /* If this block had a CRC error, force file level CRC error. */
  600                 if (bd->writeCRC != bd->headerCRC) {
  601                         bd->totalCRC = bd->headerCRC+1;
  602                         return RETVAL_LAST_BLOCK;
  603                 }
  604         }
  605 
  606         /* Refill the intermediate buffer by Huffman-decoding next
  607          * block of input */
  608         /* (previous is just a convenient unused temp variable here) */
  609         previous = get_next_block(bd);
  610         if (previous) {
  611                 bd->writeCount = previous;
  612                 return (previous != RETVAL_LAST_BLOCK) ? previous : gotcount;
  613         }
  614         bd->writeCRC = 0xffffffffUL;
  615         pos = bd->writePos;
  616         xcurrent = bd->writeCurrent;
  617         goto decode_next_byte;
  618 }
  619 
  620 static int INIT nofill(void *buf, unsigned int len)
  621 {
  622         return -1;
  623 }
  624 
  625 /* Allocate the structure, read file header.  If in_fd ==-1, inbuf must contain
  626    a complete bunzip file (len bytes long).  If in_fd!=-1, inbuf and len are
  627    ignored, and data is read from file handle into temporary buffer. */
  628 static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, int len,
  629                              int (*fill)(void*, unsigned int))
  630 {
  631         struct bunzip_data *bd;
  632         unsigned int i, j, c;
  633         const unsigned int BZh0 =
  634                 (((unsigned int)'B') << 24)+(((unsigned int)'Z') << 16)
  635                 +(((unsigned int)'h') << 8)+(unsigned int)'';
  636 
  637         /* Figure out how much data to allocate */
  638         i = sizeof(struct bunzip_data);
  639 
  640         /* Allocate bunzip_data.  Most fields initialize to zero. */
  641         bd = *bdp = malloc(i);
  642         if (!bd)
  643                 return RETVAL_OUT_OF_MEMORY;
  644         memset(bd, 0, sizeof(struct bunzip_data));
  645         /* Setup input buffer */
  646         bd->inbuf = inbuf;
  647         bd->inbufCount = len;
  648         if (fill != NULL)
  649                 bd->fill = fill;
  650         else
  651                 bd->fill = nofill;
  652 
  653         /* Init the CRC32 table (big endian) */
  654         for (i = 0; i < 256; i++) {
  655                 c = i << 24;
  656                 for (j = 8; j; j--)
  657                         c = c&0x80000000 ? (c << 1)^0x04c11db7 : (c << 1);
  658                 bd->crc32Table[i] = c;
  659         }
  660 
  661         /* Ensure that file starts with "BZh['1'-'9']." */
  662         i = get_bits(bd, 32);
  663         if (((unsigned int)(i-BZh0-1)) >= 9)
  664                 return RETVAL_NOT_BZIP_DATA;
  665 
  666         /* Fourth byte (ascii '1'-'9'), indicates block size in units of 100k of
  667            uncompressed data.  Allocate intermediate buffer for block. */
  668         bd->dbufSize = 100000*(i-BZh0);
  669 
  670         bd->dbuf = large_malloc(bd->dbufSize * sizeof(int));
  671         if (!bd->dbuf)
  672                 return RETVAL_OUT_OF_MEMORY;
  673         return RETVAL_OK;
  674 }
  675 
  676 /* Example usage: decompress src_fd to dst_fd.  (Stops at end of bzip2 data,
  677    not end of file.) */
  678 STATIC int INIT bunzip2(unsigned char *buf, int len,
  679                         int(*fill)(void*, unsigned int),
  680                         int(*flush)(void*, unsigned int),
  681                         unsigned char *outbuf,
  682                         int *pos,
  683                         void(*error)(char *x))
  684 {
  685         struct bunzip_data *bd;
  686         int i = -1;
  687         unsigned char *inbuf;
  688 
  689         if (flush)
  690                 outbuf = malloc(BZIP2_IOBUF_SIZE);
  691 
  692         if (!outbuf) {
  693                 error("Could not allocate output buffer");
  694                 return RETVAL_OUT_OF_MEMORY;
  695         }
  696         if (buf)
  697                 inbuf = buf;
  698         else
  699                 inbuf = malloc(BZIP2_IOBUF_SIZE);
  700         if (!inbuf) {
  701                 error("Could not allocate input buffer");
  702                 i = RETVAL_OUT_OF_MEMORY;
  703                 goto exit_0;
  704         }
  705         i = start_bunzip(&bd, inbuf, len, fill);
  706         if (!i) {
  707                 for (;;) {
  708                         i = read_bunzip(bd, outbuf, BZIP2_IOBUF_SIZE);
  709                         if (i <= 0)
  710                                 break;
  711                         if (!flush)
  712                                 outbuf += i;
  713                         else
  714                                 if (i != flush(outbuf, i)) {
  715                                         i = RETVAL_UNEXPECTED_OUTPUT_EOF;
  716                                         break;
  717                                 }
  718                 }
  719         }
  720         /* Check CRC and release memory */
  721         if (i == RETVAL_LAST_BLOCK) {
  722                 if (bd->headerCRC != bd->totalCRC)
  723                         error("Data integrity error when decompressing.");
  724                 else
  725                         i = RETVAL_OK;
  726         } else if (i == RETVAL_UNEXPECTED_OUTPUT_EOF) {
  727                 error("Compressed file ends unexpectedly");
  728         }
  729         if (!bd)
  730                 goto exit_1;
  731         if (bd->dbuf)
  732                 large_free(bd->dbuf);
  733         if (pos)
  734                 *pos = bd->inbufPos;
  735         free(bd);
  736 exit_1:
  737         if (!buf)
  738                 free(inbuf);
  739 exit_0:
  740         if (flush)
  741                 free(outbuf);
  742         return i;
  743 }
  744 
  745 #ifdef PREBOOT
  746 STATIC int INIT decompress(unsigned char *buf, int len,
  747                         int(*fill)(void*, unsigned int),
  748                         int(*flush)(void*, unsigned int),
  749                         unsigned char *outbuf,
  750                         int *pos,
  751                         void(*error)(char *x))
  752 {
  753         return bunzip2(buf, len - 4, fill, flush, outbuf, pos, error);
  754 }
  755 #endif

Cache object: f944e5a1ac175f2e3ab642fd2fe95573


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.