The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/lib/libkern/softfloat.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /* $NetBSD: softfloat.c,v 1.3 2003/12/04 13:57:31 keihan Exp $ */
    2 
    3 /*
    4  * This version hacked for use with gcc -msoft-float by bjh21.
    5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
    6  *  itself).
    7  */
    8 
    9 /*
   10  * Things you may want to define:
   11  *
   12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
   13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
   14  *   properly renamed.
   15  */
   16 
   17 /*
   18 ===============================================================================
   19 
   20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
   21 Arithmetic Package, Release 2a.
   22 
   23 Written by John R. Hauser.  This work was made possible in part by the
   24 International Computer Science Institute, located at Suite 600, 1947 Center
   25 Street, Berkeley, California 94704.  Funding was partially provided by the
   26 National Science Foundation under grant MIP-9311980.  The original version
   27 of this code was written as part of a project to build a fixed-point vector
   28 processor in collaboration with the University of California at Berkeley,
   29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
   30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
   31 arithmetic/SoftFloat.html'.
   32 
   33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
   34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
   35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
   36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
   37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
   38 
   39 Derivative works are acceptable, even for commercial purposes, so long as
   40 (1) they include prominent notice that the work is derivative, and (2) they
   41 include prominent notice akin to these four paragraphs for those parts of
   42 this code that are retained.
   43 
   44 ===============================================================================
   45 */
   46 
   47 /* If you need this in a boot program, you have bigger problems... */
   48 #ifndef _STANDALONE
   49 
   50 #include <sys/cdefs.h>
   51 #if defined(LIBC_SCCS) && !defined(lint)
   52 __RCSID("$NetBSD: softfloat.c,v 1.3 2003/12/04 13:57:31 keihan Exp $");
   53 #endif /* LIBC_SCCS and not lint */
   54 
   55 #ifdef SOFTFLOAT_FOR_GCC
   56 #include "softfloat-for-gcc.h"
   57 #endif
   58 
   59 #include "milieu.h"
   60 #include "softfloat.h"
   61 
   62 /*
   63  * Conversions between floats as stored in memory and floats as
   64  * SoftFloat uses them
   65  */
   66 #ifndef FLOAT64_DEMANGLE
   67 #define FLOAT64_DEMANGLE(a)     (a)
   68 #endif
   69 #ifndef FLOAT64_MANGLE
   70 #define FLOAT64_MANGLE(a)       (a)
   71 #endif
   72 
   73 /*
   74 -------------------------------------------------------------------------------
   75 Floating-point rounding mode, extended double-precision rounding precision,
   76 and exception flags.
   77 -------------------------------------------------------------------------------
   78 */
   79 
   80 /*
   81  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
   82  *      Right now, it does not.  I've removed all other dynamic global
   83  *      variables. [ross]
   84  */
   85 #ifdef FLOATX80
   86 int8 floatx80_rounding_precision = 80;
   87 #endif
   88 
   89 /*
   90 -------------------------------------------------------------------------------
   91 Primitive arithmetic functions, including multi-word arithmetic, and
   92 division and square root approximations.  (Can be specialized to target if
   93 desired.)
   94 -------------------------------------------------------------------------------
   95 */
   96 #include "softfloat-macros.h"
   97 
   98 /*
   99 -------------------------------------------------------------------------------
  100 Functions and definitions to determine:  (1) whether tininess for underflow
  101 is detected before or after rounding by default, (2) what (if anything)
  102 happens when exceptions are raised, (3) how signaling NaNs are distinguished
  103 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  104 are propagated from function inputs to output.  These details are target-
  105 specific.
  106 -------------------------------------------------------------------------------
  107 */
  108 #include "softfloat-specialize.h"
  109 
  110 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
  111 /*
  112 -------------------------------------------------------------------------------
  113 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  114 and 7, and returns the properly rounded 32-bit integer corresponding to the
  115 input.  If `zSign' is 1, the input is negated before being converted to an
  116 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  117 is simply rounded to an integer, with the inexact exception raised if the
  118 input cannot be represented exactly as an integer.  However, if the fixed-
  119 point input is too large, the invalid exception is raised and the largest
  120 positive or negative integer is returned.
  121 -------------------------------------------------------------------------------
  122 */
  123 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
  124 {
  125     int8 roundingMode;
  126     flag roundNearestEven;
  127     int8 roundIncrement, roundBits;
  128     int32 z;
  129 
  130     roundingMode = float_rounding_mode();
  131     roundNearestEven = ( roundingMode == float_round_nearest_even );
  132     roundIncrement = 0x40;
  133     if ( ! roundNearestEven ) {
  134         if ( roundingMode == float_round_to_zero ) {
  135             roundIncrement = 0;
  136         }
  137         else {
  138             roundIncrement = 0x7F;
  139             if ( zSign ) {
  140                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  141             }
  142             else {
  143                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  144             }
  145         }
  146     }
  147     roundBits = absZ & 0x7F;
  148     absZ = ( absZ + roundIncrement )>>7;
  149     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
  150     z = absZ;
  151     if ( zSign ) z = - z;
  152     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
  153         float_raise( float_flag_invalid );
  154         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
  155     }
  156     if ( roundBits ) float_set_inexact();
  157     return z;
  158 
  159 }
  160 
  161 /*
  162 -------------------------------------------------------------------------------
  163 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  164 `absZ1', with binary point between bits 63 and 64 (between the input words),
  165 and returns the properly rounded 64-bit integer corresponding to the input.
  166 If `zSign' is 1, the input is negated before being converted to an integer.
  167 Ordinarily, the fixed-point input is simply rounded to an integer, with
  168 the inexact exception raised if the input cannot be represented exactly as
  169 an integer.  However, if the fixed-point input is too large, the invalid
  170 exception is raised and the largest positive or negative integer is
  171 returned.
  172 -------------------------------------------------------------------------------
  173 */
  174 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
  175 {
  176     int8 roundingMode;
  177     flag roundNearestEven, increment;
  178     int64 z;
  179 
  180     roundingMode = float_rounding_mode();
  181     roundNearestEven = ( roundingMode == float_round_nearest_even );
  182     increment = ( (sbits64) absZ1 < 0 );
  183     if ( ! roundNearestEven ) {
  184         if ( roundingMode == float_round_to_zero ) {
  185             increment = 0;
  186         }
  187         else {
  188             if ( zSign ) {
  189                 increment = ( roundingMode == float_round_down ) && absZ1;
  190             }
  191             else {
  192                 increment = ( roundingMode == float_round_up ) && absZ1;
  193             }
  194         }
  195     }
  196     if ( increment ) {
  197         ++absZ0;
  198         if ( absZ0 == 0 ) goto overflow;
  199         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
  200     }
  201     z = absZ0;
  202     if ( zSign ) z = - z;
  203     if ( z && ( ( z < 0 ) ^ zSign ) ) {
  204  overflow:
  205         float_raise( float_flag_invalid );
  206         return
  207               zSign ? (sbits64) LIT64( 0x8000000000000000 )
  208             : LIT64( 0x7FFFFFFFFFFFFFFF );
  209     }
  210     if ( absZ1 ) float_set_inexact();
  211     return z;
  212 
  213 }
  214 #endif
  215 
  216 /*
  217 -------------------------------------------------------------------------------
  218 Returns the fraction bits of the single-precision floating-point value `a'.
  219 -------------------------------------------------------------------------------
  220 */
  221 INLINE bits32 extractFloat32Frac( float32 a )
  222 {
  223 
  224     return a & 0x007FFFFF;
  225 
  226 }
  227 
  228 /*
  229 -------------------------------------------------------------------------------
  230 Returns the exponent bits of the single-precision floating-point value `a'.
  231 -------------------------------------------------------------------------------
  232 */
  233 INLINE int16 extractFloat32Exp( float32 a )
  234 {
  235 
  236     return ( a>>23 ) & 0xFF;
  237 
  238 }
  239 
  240 /*
  241 -------------------------------------------------------------------------------
  242 Returns the sign bit of the single-precision floating-point value `a'.
  243 -------------------------------------------------------------------------------
  244 */
  245 INLINE flag extractFloat32Sign( float32 a )
  246 {
  247 
  248     return a>>31;
  249 
  250 }
  251 
  252 /*
  253 -------------------------------------------------------------------------------
  254 Normalizes the subnormal single-precision floating-point value represented
  255 by the denormalized significand `aSig'.  The normalized exponent and
  256 significand are stored at the locations pointed to by `zExpPtr' and
  257 `zSigPtr', respectively.
  258 -------------------------------------------------------------------------------
  259 */
  260 static void
  261  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
  262 {
  263     int8 shiftCount;
  264 
  265     shiftCount = countLeadingZeros32( aSig ) - 8;
  266     *zSigPtr = aSig<<shiftCount;
  267     *zExpPtr = 1 - shiftCount;
  268 
  269 }
  270 
  271 /*
  272 -------------------------------------------------------------------------------
  273 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  274 single-precision floating-point value, returning the result.  After being
  275 shifted into the proper positions, the three fields are simply added
  276 together to form the result.  This means that any integer portion of `zSig'
  277 will be added into the exponent.  Since a properly normalized significand
  278 will have an integer portion equal to 1, the `zExp' input should be 1 less
  279 than the desired result exponent whenever `zSig' is a complete, normalized
  280 significand.
  281 -------------------------------------------------------------------------------
  282 */
  283 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
  284 {
  285 
  286     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
  287 
  288 }
  289 
  290 /*
  291 -------------------------------------------------------------------------------
  292 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  293 and significand `zSig', and returns the proper single-precision floating-
  294 point value corresponding to the abstract input.  Ordinarily, the abstract
  295 value is simply rounded and packed into the single-precision format, with
  296 the inexact exception raised if the abstract input cannot be represented
  297 exactly.  However, if the abstract value is too large, the overflow and
  298 inexact exceptions are raised and an infinity or maximal finite value is
  299 returned.  If the abstract value is too small, the input value is rounded to
  300 a subnormal number, and the underflow and inexact exceptions are raised if
  301 the abstract input cannot be represented exactly as a subnormal single-
  302 precision floating-point number.
  303     The input significand `zSig' has its binary point between bits 30
  304 and 29, which is 7 bits to the left of the usual location.  This shifted
  305 significand must be normalized or smaller.  If `zSig' is not normalized,
  306 `zExp' must be 0; in that case, the result returned is a subnormal number,
  307 and it must not require rounding.  In the usual case that `zSig' is
  308 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  309 The handling of underflow and overflow follows the IEC/IEEE Standard for
  310 Binary Floating-Point Arithmetic.
  311 -------------------------------------------------------------------------------
  312 */
  313 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
  314 {
  315     int8 roundingMode;
  316     flag roundNearestEven;
  317     int8 roundIncrement, roundBits;
  318     flag isTiny;
  319 
  320     roundingMode = float_rounding_mode();
  321     roundNearestEven = ( roundingMode == float_round_nearest_even );
  322     roundIncrement = 0x40;
  323     if ( ! roundNearestEven ) {
  324         if ( roundingMode == float_round_to_zero ) {
  325             roundIncrement = 0;
  326         }
  327         else {
  328             roundIncrement = 0x7F;
  329             if ( zSign ) {
  330                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  331             }
  332             else {
  333                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  334             }
  335         }
  336     }
  337     roundBits = zSig & 0x7F;
  338     if ( 0xFD <= (bits16) zExp ) {
  339         if (    ( 0xFD < zExp )
  340              || (    ( zExp == 0xFD )
  341                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
  342            ) {
  343             float_raise( float_flag_overflow | float_flag_inexact );
  344             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
  345         }
  346         if ( zExp < 0 ) {
  347             isTiny =
  348                    ( float_detect_tininess == float_tininess_before_rounding )
  349                 || ( zExp < -1 )
  350                 || ( zSig + roundIncrement < 0x80000000 );
  351             shift32RightJamming( zSig, - zExp, &zSig );
  352             zExp = 0;
  353             roundBits = zSig & 0x7F;
  354             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
  355         }
  356     }
  357     if ( roundBits ) float_set_inexact();
  358     zSig = ( zSig + roundIncrement )>>7;
  359     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
  360     if ( zSig == 0 ) zExp = 0;
  361     return packFloat32( zSign, zExp, zSig );
  362 
  363 }
  364 
  365 /*
  366 -------------------------------------------------------------------------------
  367 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  368 and significand `zSig', and returns the proper single-precision floating-
  369 point value corresponding to the abstract input.  This routine is just like
  370 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  371 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  372 floating-point exponent.
  373 -------------------------------------------------------------------------------
  374 */
  375 static float32
  376  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
  377 {
  378     int8 shiftCount;
  379 
  380     shiftCount = countLeadingZeros32( zSig ) - 1;
  381     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
  382 
  383 }
  384 
  385 /*
  386 -------------------------------------------------------------------------------
  387 Returns the fraction bits of the double-precision floating-point value `a'.
  388 -------------------------------------------------------------------------------
  389 */
  390 INLINE bits64 extractFloat64Frac( float64 a )
  391 {
  392 
  393     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
  394 
  395 }
  396 
  397 /*
  398 -------------------------------------------------------------------------------
  399 Returns the exponent bits of the double-precision floating-point value `a'.
  400 -------------------------------------------------------------------------------
  401 */
  402 INLINE int16 extractFloat64Exp( float64 a )
  403 {
  404 
  405     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
  406 
  407 }
  408 
  409 /*
  410 -------------------------------------------------------------------------------
  411 Returns the sign bit of the double-precision floating-point value `a'.
  412 -------------------------------------------------------------------------------
  413 */
  414 INLINE flag extractFloat64Sign( float64 a )
  415 {
  416 
  417     return FLOAT64_DEMANGLE(a)>>63;
  418 
  419 }
  420 
  421 /*
  422 -------------------------------------------------------------------------------
  423 Normalizes the subnormal double-precision floating-point value represented
  424 by the denormalized significand `aSig'.  The normalized exponent and
  425 significand are stored at the locations pointed to by `zExpPtr' and
  426 `zSigPtr', respectively.
  427 -------------------------------------------------------------------------------
  428 */
  429 static void
  430  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
  431 {
  432     int8 shiftCount;
  433 
  434     shiftCount = countLeadingZeros64( aSig ) - 11;
  435     *zSigPtr = aSig<<shiftCount;
  436     *zExpPtr = 1 - shiftCount;
  437 
  438 }
  439 
  440 /*
  441 -------------------------------------------------------------------------------
  442 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  443 double-precision floating-point value, returning the result.  After being
  444 shifted into the proper positions, the three fields are simply added
  445 together to form the result.  This means that any integer portion of `zSig'
  446 will be added into the exponent.  Since a properly normalized significand
  447 will have an integer portion equal to 1, the `zExp' input should be 1 less
  448 than the desired result exponent whenever `zSig' is a complete, normalized
  449 significand.
  450 -------------------------------------------------------------------------------
  451 */
  452 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
  453 {
  454 
  455     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
  456                            ( ( (bits64) zExp )<<52 ) + zSig );
  457 
  458 }
  459 
  460 /*
  461 -------------------------------------------------------------------------------
  462 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  463 and significand `zSig', and returns the proper double-precision floating-
  464 point value corresponding to the abstract input.  Ordinarily, the abstract
  465 value is simply rounded and packed into the double-precision format, with
  466 the inexact exception raised if the abstract input cannot be represented
  467 exactly.  However, if the abstract value is too large, the overflow and
  468 inexact exceptions are raised and an infinity or maximal finite value is
  469 returned.  If the abstract value is too small, the input value is rounded to
  470 a subnormal number, and the underflow and inexact exceptions are raised if
  471 the abstract input cannot be represented exactly as a subnormal double-
  472 precision floating-point number.
  473     The input significand `zSig' has its binary point between bits 62
  474 and 61, which is 10 bits to the left of the usual location.  This shifted
  475 significand must be normalized or smaller.  If `zSig' is not normalized,
  476 `zExp' must be 0; in that case, the result returned is a subnormal number,
  477 and it must not require rounding.  In the usual case that `zSig' is
  478 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  479 The handling of underflow and overflow follows the IEC/IEEE Standard for
  480 Binary Floating-Point Arithmetic.
  481 -------------------------------------------------------------------------------
  482 */
  483 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
  484 {
  485     int8 roundingMode;
  486     flag roundNearestEven;
  487     int16 roundIncrement, roundBits;
  488     flag isTiny;
  489 
  490     roundingMode = float_rounding_mode();
  491     roundNearestEven = ( roundingMode == float_round_nearest_even );
  492     roundIncrement = 0x200;
  493     if ( ! roundNearestEven ) {
  494         if ( roundingMode == float_round_to_zero ) {
  495             roundIncrement = 0;
  496         }
  497         else {
  498             roundIncrement = 0x3FF;
  499             if ( zSign ) {
  500                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  501             }
  502             else {
  503                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  504             }
  505         }
  506     }
  507     roundBits = zSig & 0x3FF;
  508     if ( 0x7FD <= (bits16) zExp ) {
  509         if (    ( 0x7FD < zExp )
  510              || (    ( zExp == 0x7FD )
  511                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
  512            ) {
  513             float_raise( float_flag_overflow | float_flag_inexact );
  514             return FLOAT64_MANGLE(
  515                 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
  516                 ( roundIncrement == 0 ));
  517         }
  518         if ( zExp < 0 ) {
  519             isTiny =
  520                    ( float_detect_tininess == float_tininess_before_rounding )
  521                 || ( zExp < -1 )
  522                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
  523             shift64RightJamming( zSig, - zExp, &zSig );
  524             zExp = 0;
  525             roundBits = zSig & 0x3FF;
  526             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
  527         }
  528     }
  529     if ( roundBits ) float_set_inexact();
  530     zSig = ( zSig + roundIncrement )>>10;
  531     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
  532     if ( zSig == 0 ) zExp = 0;
  533     return packFloat64( zSign, zExp, zSig );
  534 
  535 }
  536 
  537 /*
  538 -------------------------------------------------------------------------------
  539 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  540 and significand `zSig', and returns the proper double-precision floating-
  541 point value corresponding to the abstract input.  This routine is just like
  542 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
  543 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  544 floating-point exponent.
  545 -------------------------------------------------------------------------------
  546 */
  547 static float64
  548  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
  549 {
  550     int8 shiftCount;
  551 
  552     shiftCount = countLeadingZeros64( zSig ) - 1;
  553     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
  554 
  555 }
  556 
  557 #ifdef FLOATX80
  558 
  559 /*
  560 -------------------------------------------------------------------------------
  561 Returns the fraction bits of the extended double-precision floating-point
  562 value `a'.
  563 -------------------------------------------------------------------------------
  564 */
  565 INLINE bits64 extractFloatx80Frac( floatx80 a )
  566 {
  567 
  568     return a.low;
  569 
  570 }
  571 
  572 /*
  573 -------------------------------------------------------------------------------
  574 Returns the exponent bits of the extended double-precision floating-point
  575 value `a'.
  576 -------------------------------------------------------------------------------
  577 */
  578 INLINE int32 extractFloatx80Exp( floatx80 a )
  579 {
  580 
  581     return a.high & 0x7FFF;
  582 
  583 }
  584 
  585 /*
  586 -------------------------------------------------------------------------------
  587 Returns the sign bit of the extended double-precision floating-point value
  588 `a'.
  589 -------------------------------------------------------------------------------
  590 */
  591 INLINE flag extractFloatx80Sign( floatx80 a )
  592 {
  593 
  594     return a.high>>15;
  595 
  596 }
  597 
  598 /*
  599 -------------------------------------------------------------------------------
  600 Normalizes the subnormal extended double-precision floating-point value
  601 represented by the denormalized significand `aSig'.  The normalized exponent
  602 and significand are stored at the locations pointed to by `zExpPtr' and
  603 `zSigPtr', respectively.
  604 -------------------------------------------------------------------------------
  605 */
  606 static void
  607  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
  608 {
  609     int8 shiftCount;
  610 
  611     shiftCount = countLeadingZeros64( aSig );
  612     *zSigPtr = aSig<<shiftCount;
  613     *zExpPtr = 1 - shiftCount;
  614 
  615 }
  616 
  617 /*
  618 -------------------------------------------------------------------------------
  619 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  620 extended double-precision floating-point value, returning the result.
  621 -------------------------------------------------------------------------------
  622 */
  623 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
  624 {
  625     floatx80 z;
  626 
  627     z.low = zSig;
  628     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
  629     return z;
  630 
  631 }
  632 
  633 /*
  634 -------------------------------------------------------------------------------
  635 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  636 and extended significand formed by the concatenation of `zSig0' and `zSig1',
  637 and returns the proper extended double-precision floating-point value
  638 corresponding to the abstract input.  Ordinarily, the abstract value is
  639 rounded and packed into the extended double-precision format, with the
  640 inexact exception raised if the abstract input cannot be represented
  641 exactly.  However, if the abstract value is too large, the overflow and
  642 inexact exceptions are raised and an infinity or maximal finite value is
  643 returned.  If the abstract value is too small, the input value is rounded to
  644 a subnormal number, and the underflow and inexact exceptions are raised if
  645 the abstract input cannot be represented exactly as a subnormal extended
  646 double-precision floating-point number.
  647     If `roundingPrecision' is 32 or 64, the result is rounded to the same
  648 number of bits as single or double precision, respectively.  Otherwise, the
  649 result is rounded to the full precision of the extended double-precision
  650 format.
  651     The input significand must be normalized or smaller.  If the input
  652 significand is not normalized, `zExp' must be 0; in that case, the result
  653 returned is a subnormal number, and it must not require rounding.  The
  654 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  655 Floating-Point Arithmetic.
  656 -------------------------------------------------------------------------------
  657 */
  658 static floatx80
  659  roundAndPackFloatx80(
  660      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
  661  )
  662 {
  663     int8 roundingMode;
  664     flag roundNearestEven, increment, isTiny;
  665     int64 roundIncrement, roundMask, roundBits;
  666 
  667     roundingMode = float_rounding_mode();
  668     roundNearestEven = ( roundingMode == float_round_nearest_even );
  669     if ( roundingPrecision == 80 ) goto precision80;
  670     if ( roundingPrecision == 64 ) {
  671         roundIncrement = LIT64( 0x0000000000000400 );
  672         roundMask = LIT64( 0x00000000000007FF );
  673     }
  674     else if ( roundingPrecision == 32 ) {
  675         roundIncrement = LIT64( 0x0000008000000000 );
  676         roundMask = LIT64( 0x000000FFFFFFFFFF );
  677     }
  678     else {
  679         goto precision80;
  680     }
  681     zSig0 |= ( zSig1 != 0 );
  682     if ( ! roundNearestEven ) {
  683         if ( roundingMode == float_round_to_zero ) {
  684             roundIncrement = 0;
  685         }
  686         else {
  687             roundIncrement = roundMask;
  688             if ( zSign ) {
  689                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  690             }
  691             else {
  692                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  693             }
  694         }
  695     }
  696     roundBits = zSig0 & roundMask;
  697     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
  698         if (    ( 0x7FFE < zExp )
  699              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
  700            ) {
  701             goto overflow;
  702         }
  703         if ( zExp <= 0 ) {
  704             isTiny =
  705                    ( float_detect_tininess == float_tininess_before_rounding )
  706                 || ( zExp < 0 )
  707                 || ( zSig0 <= zSig0 + roundIncrement );
  708             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
  709             zExp = 0;
  710             roundBits = zSig0 & roundMask;
  711             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
  712             if ( roundBits ) float_set_inexact();
  713             zSig0 += roundIncrement;
  714             if ( (sbits64) zSig0 < 0 ) zExp = 1;
  715             roundIncrement = roundMask + 1;
  716             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
  717                 roundMask |= roundIncrement;
  718             }
  719             zSig0 &= ~ roundMask;
  720             return packFloatx80( zSign, zExp, zSig0 );
  721         }
  722     }
  723     if ( roundBits ) float_set_inexact();
  724     zSig0 += roundIncrement;
  725     if ( zSig0 < roundIncrement ) {
  726         ++zExp;
  727         zSig0 = LIT64( 0x8000000000000000 );
  728     }
  729     roundIncrement = roundMask + 1;
  730     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
  731         roundMask |= roundIncrement;
  732     }
  733     zSig0 &= ~ roundMask;
  734     if ( zSig0 == 0 ) zExp = 0;
  735     return packFloatx80( zSign, zExp, zSig0 );
  736  precision80:
  737     increment = ( (sbits64) zSig1 < 0 );
  738     if ( ! roundNearestEven ) {
  739         if ( roundingMode == float_round_to_zero ) {
  740             increment = 0;
  741         }
  742         else {
  743             if ( zSign ) {
  744                 increment = ( roundingMode == float_round_down ) && zSig1;
  745             }
  746             else {
  747                 increment = ( roundingMode == float_round_up ) && zSig1;
  748             }
  749         }
  750     }
  751     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
  752         if (    ( 0x7FFE < zExp )
  753              || (    ( zExp == 0x7FFE )
  754                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
  755                   && increment
  756                 )
  757            ) {
  758             roundMask = 0;
  759  overflow:
  760             float_raise( float_flag_overflow | float_flag_inexact );
  761             if (    ( roundingMode == float_round_to_zero )
  762                  || ( zSign && ( roundingMode == float_round_up ) )
  763                  || ( ! zSign && ( roundingMode == float_round_down ) )
  764                ) {
  765                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
  766             }
  767             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  768         }
  769         if ( zExp <= 0 ) {
  770             isTiny =
  771                    ( float_detect_tininess == float_tininess_before_rounding )
  772                 || ( zExp < 0 )
  773                 || ! increment
  774                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
  775             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
  776             zExp = 0;
  777             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
  778             if ( zSig1 ) float_set_inexact();
  779             if ( roundNearestEven ) {
  780                 increment = ( (sbits64) zSig1 < 0 );
  781             }
  782             else {
  783                 if ( zSign ) {
  784                     increment = ( roundingMode == float_round_down ) && zSig1;
  785                 }
  786                 else {
  787                     increment = ( roundingMode == float_round_up ) && zSig1;
  788                 }
  789             }
  790             if ( increment ) {
  791                 ++zSig0;
  792                 zSig0 &=
  793                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
  794                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
  795             }
  796             return packFloatx80( zSign, zExp, zSig0 );
  797         }
  798     }
  799     if ( zSig1 ) float_set_inexact();
  800     if ( increment ) {
  801         ++zSig0;
  802         if ( zSig0 == 0 ) {
  803             ++zExp;
  804             zSig0 = LIT64( 0x8000000000000000 );
  805         }
  806         else {
  807             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
  808         }
  809     }
  810     else {
  811         if ( zSig0 == 0 ) zExp = 0;
  812     }
  813     return packFloatx80( zSign, zExp, zSig0 );
  814 
  815 }
  816 
  817 /*
  818 -------------------------------------------------------------------------------
  819 Takes an abstract floating-point value having sign `zSign', exponent
  820 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  821 and returns the proper extended double-precision floating-point value
  822 corresponding to the abstract input.  This routine is just like
  823 `roundAndPackFloatx80' except that the input significand does not have to be
  824 normalized.
  825 -------------------------------------------------------------------------------
  826 */
  827 static floatx80
  828  normalizeRoundAndPackFloatx80(
  829      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
  830  )
  831 {
  832     int8 shiftCount;
  833 
  834     if ( zSig0 == 0 ) {
  835         zSig0 = zSig1;
  836         zSig1 = 0;
  837         zExp -= 64;
  838     }
  839     shiftCount = countLeadingZeros64( zSig0 );
  840     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
  841     zExp -= shiftCount;
  842     return
  843         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
  844 
  845 }
  846 
  847 #endif
  848 
  849 #ifdef FLOAT128
  850 
  851 /*
  852 -------------------------------------------------------------------------------
  853 Returns the least-significant 64 fraction bits of the quadruple-precision
  854 floating-point value `a'.
  855 -------------------------------------------------------------------------------
  856 */
  857 INLINE bits64 extractFloat128Frac1( float128 a )
  858 {
  859 
  860     return a.low;
  861 
  862 }
  863 
  864 /*
  865 -------------------------------------------------------------------------------
  866 Returns the most-significant 48 fraction bits of the quadruple-precision
  867 floating-point value `a'.
  868 -------------------------------------------------------------------------------
  869 */
  870 INLINE bits64 extractFloat128Frac0( float128 a )
  871 {
  872 
  873     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
  874 
  875 }
  876 
  877 /*
  878 -------------------------------------------------------------------------------
  879 Returns the exponent bits of the quadruple-precision floating-point value
  880 `a'.
  881 -------------------------------------------------------------------------------
  882 */
  883 INLINE int32 extractFloat128Exp( float128 a )
  884 {
  885 
  886     return ( a.high>>48 ) & 0x7FFF;
  887 
  888 }
  889 
  890 /*
  891 -------------------------------------------------------------------------------
  892 Returns the sign bit of the quadruple-precision floating-point value `a'.
  893 -------------------------------------------------------------------------------
  894 */
  895 INLINE flag extractFloat128Sign( float128 a )
  896 {
  897 
  898     return a.high>>63;
  899 
  900 }
  901 
  902 /*
  903 -------------------------------------------------------------------------------
  904 Normalizes the subnormal quadruple-precision floating-point value
  905 represented by the denormalized significand formed by the concatenation of
  906 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
  907 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
  908 significand are stored at the location pointed to by `zSig0Ptr', and the
  909 least significant 64 bits of the normalized significand are stored at the
  910 location pointed to by `zSig1Ptr'.
  911 -------------------------------------------------------------------------------
  912 */
  913 static void
  914  normalizeFloat128Subnormal(
  915      bits64 aSig0,
  916      bits64 aSig1,
  917      int32 *zExpPtr,
  918      bits64 *zSig0Ptr,
  919      bits64 *zSig1Ptr
  920  )
  921 {
  922     int8 shiftCount;
  923 
  924     if ( aSig0 == 0 ) {
  925         shiftCount = countLeadingZeros64( aSig1 ) - 15;
  926         if ( shiftCount < 0 ) {
  927             *zSig0Ptr = aSig1>>( - shiftCount );
  928             *zSig1Ptr = aSig1<<( shiftCount & 63 );
  929         }
  930         else {
  931             *zSig0Ptr = aSig1<<shiftCount;
  932             *zSig1Ptr = 0;
  933         }
  934         *zExpPtr = - shiftCount - 63;
  935     }
  936     else {
  937         shiftCount = countLeadingZeros64( aSig0 ) - 15;
  938         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
  939         *zExpPtr = 1 - shiftCount;
  940     }
  941 
  942 }
  943 
  944 /*
  945 -------------------------------------------------------------------------------
  946 Packs the sign `zSign', the exponent `zExp', and the significand formed
  947 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  948 floating-point value, returning the result.  After being shifted into the
  949 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  950 added together to form the most significant 32 bits of the result.  This
  951 means that any integer portion of `zSig0' will be added into the exponent.
  952 Since a properly normalized significand will have an integer portion equal
  953 to 1, the `zExp' input should be 1 less than the desired result exponent
  954 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  955 significand.
  956 -------------------------------------------------------------------------------
  957 */
  958 INLINE float128
  959  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
  960 {
  961     float128 z;
  962 
  963     z.low = zSig1;
  964     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
  965     return z;
  966 
  967 }
  968 
  969 /*
  970 -------------------------------------------------------------------------------
  971 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  972 and extended significand formed by the concatenation of `zSig0', `zSig1',
  973 and `zSig2', and returns the proper quadruple-precision floating-point value
  974 corresponding to the abstract input.  Ordinarily, the abstract value is
  975 simply rounded and packed into the quadruple-precision format, with the
  976 inexact exception raised if the abstract input cannot be represented
  977 exactly.  However, if the abstract value is too large, the overflow and
  978 inexact exceptions are raised and an infinity or maximal finite value is
  979 returned.  If the abstract value is too small, the input value is rounded to
  980 a subnormal number, and the underflow and inexact exceptions are raised if
  981 the abstract input cannot be represented exactly as a subnormal quadruple-
  982 precision floating-point number.
  983     The input significand must be normalized or smaller.  If the input
  984 significand is not normalized, `zExp' must be 0; in that case, the result
  985 returned is a subnormal number, and it must not require rounding.  In the
  986 usual case that the input significand is normalized, `zExp' must be 1 less
  987 than the ``true'' floating-point exponent.  The handling of underflow and
  988 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  989 -------------------------------------------------------------------------------
  990 */
  991 static float128
  992  roundAndPackFloat128(
  993      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
  994 {
  995     int8 roundingMode;
  996     flag roundNearestEven, increment, isTiny;
  997 
  998     roundingMode = float_rounding_mode();
  999     roundNearestEven = ( roundingMode == float_round_nearest_even );
 1000     increment = ( (sbits64) zSig2 < 0 );
 1001     if ( ! roundNearestEven ) {
 1002         if ( roundingMode == float_round_to_zero ) {
 1003             increment = 0;
 1004         }
 1005         else {
 1006             if ( zSign ) {
 1007                 increment = ( roundingMode == float_round_down ) && zSig2;
 1008             }
 1009             else {
 1010                 increment = ( roundingMode == float_round_up ) && zSig2;
 1011             }
 1012         }
 1013     }
 1014     if ( 0x7FFD <= (bits32) zExp ) {
 1015         if (    ( 0x7FFD < zExp )
 1016              || (    ( zExp == 0x7FFD )
 1017                   && eq128(
 1018                          LIT64( 0x0001FFFFFFFFFFFF ),
 1019                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 1020                          zSig0,
 1021                          zSig1
 1022                      )
 1023                   && increment
 1024                 )
 1025            ) {
 1026             float_raise( float_flag_overflow | float_flag_inexact );
 1027             if (    ( roundingMode == float_round_to_zero )
 1028                  || ( zSign && ( roundingMode == float_round_up ) )
 1029                  || ( ! zSign && ( roundingMode == float_round_down ) )
 1030                ) {
 1031                 return
 1032                     packFloat128(
 1033                         zSign,
 1034                         0x7FFE,
 1035                         LIT64( 0x0000FFFFFFFFFFFF ),
 1036                         LIT64( 0xFFFFFFFFFFFFFFFF )
 1037                     );
 1038             }
 1039             return packFloat128( zSign, 0x7FFF, 0, 0 );
 1040         }
 1041         if ( zExp < 0 ) {
 1042             isTiny =
 1043                    ( float_detect_tininess == float_tininess_before_rounding )
 1044                 || ( zExp < -1 )
 1045                 || ! increment
 1046                 || lt128(
 1047                        zSig0,
 1048                        zSig1,
 1049                        LIT64( 0x0001FFFFFFFFFFFF ),
 1050                        LIT64( 0xFFFFFFFFFFFFFFFF )
 1051                    );
 1052             shift128ExtraRightJamming(
 1053                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 1054             zExp = 0;
 1055             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
 1056             if ( roundNearestEven ) {
 1057                 increment = ( (sbits64) zSig2 < 0 );
 1058             }
 1059             else {
 1060                 if ( zSign ) {
 1061                     increment = ( roundingMode == float_round_down ) && zSig2;
 1062                 }
 1063                 else {
 1064                     increment = ( roundingMode == float_round_up ) && zSig2;
 1065                 }
 1066             }
 1067         }
 1068     }
 1069     if ( zSig2 ) float_set_inexact();
 1070     if ( increment ) {
 1071         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
 1072         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
 1073     }
 1074     else {
 1075         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
 1076     }
 1077     return packFloat128( zSign, zExp, zSig0, zSig1 );
 1078 
 1079 }
 1080 
 1081 /*
 1082 -------------------------------------------------------------------------------
 1083 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 1084 and significand formed by the concatenation of `zSig0' and `zSig1', and
 1085 returns the proper quadruple-precision floating-point value corresponding
 1086 to the abstract input.  This routine is just like `roundAndPackFloat128'
 1087 except that the input significand has fewer bits and does not have to be
 1088 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 1089 point exponent.
 1090 -------------------------------------------------------------------------------
 1091 */
 1092 static float128
 1093  normalizeRoundAndPackFloat128(
 1094      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 1095 {
 1096     int8 shiftCount;
 1097     bits64 zSig2;
 1098 
 1099     if ( zSig0 == 0 ) {
 1100         zSig0 = zSig1;
 1101         zSig1 = 0;
 1102         zExp -= 64;
 1103     }
 1104     shiftCount = countLeadingZeros64( zSig0 ) - 15;
 1105     if ( 0 <= shiftCount ) {
 1106         zSig2 = 0;
 1107         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 1108     }
 1109     else {
 1110         shift128ExtraRightJamming(
 1111             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
 1112     }
 1113     zExp -= shiftCount;
 1114     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
 1115 
 1116 }
 1117 
 1118 #endif
 1119 
 1120 /*
 1121 -------------------------------------------------------------------------------
 1122 Returns the result of converting the 32-bit two's complement integer `a'
 1123 to the single-precision floating-point format.  The conversion is performed
 1124 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1125 -------------------------------------------------------------------------------
 1126 */
 1127 float32 int32_to_float32( int32 a )
 1128 {
 1129     flag zSign;
 1130 
 1131     if ( a == 0 ) return 0;
 1132     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
 1133     zSign = ( a < 0 );
 1134     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
 1135 
 1136 }
 1137 
 1138 /*
 1139 -------------------------------------------------------------------------------
 1140 Returns the result of converting the 32-bit two's complement integer `a'
 1141 to the double-precision floating-point format.  The conversion is performed
 1142 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1143 -------------------------------------------------------------------------------
 1144 */
 1145 float64 int32_to_float64( int32 a )
 1146 {
 1147     flag zSign;
 1148     uint32 absA;
 1149     int8 shiftCount;
 1150     bits64 zSig;
 1151 
 1152     if ( a == 0 ) return 0;
 1153     zSign = ( a < 0 );
 1154     absA = zSign ? - a : a;
 1155     shiftCount = countLeadingZeros32( absA ) + 21;
 1156     zSig = absA;
 1157     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
 1158 
 1159 }
 1160 
 1161 #ifdef FLOATX80
 1162 
 1163 /*
 1164 -------------------------------------------------------------------------------
 1165 Returns the result of converting the 32-bit two's complement integer `a'
 1166 to the extended double-precision floating-point format.  The conversion
 1167 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 1168 Arithmetic.
 1169 -------------------------------------------------------------------------------
 1170 */
 1171 floatx80 int32_to_floatx80( int32 a )
 1172 {
 1173     flag zSign;
 1174     uint32 absA;
 1175     int8 shiftCount;
 1176     bits64 zSig;
 1177 
 1178     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
 1179     zSign = ( a < 0 );
 1180     absA = zSign ? - a : a;
 1181     shiftCount = countLeadingZeros32( absA ) + 32;
 1182     zSig = absA;
 1183     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
 1184 
 1185 }
 1186 
 1187 #endif
 1188 
 1189 #ifdef FLOAT128
 1190 
 1191 /*
 1192 -------------------------------------------------------------------------------
 1193 Returns the result of converting the 32-bit two's complement integer `a' to
 1194 the quadruple-precision floating-point format.  The conversion is performed
 1195 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1196 -------------------------------------------------------------------------------
 1197 */
 1198 float128 int32_to_float128( int32 a )
 1199 {
 1200     flag zSign;
 1201     uint32 absA;
 1202     int8 shiftCount;
 1203     bits64 zSig0;
 1204 
 1205     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
 1206     zSign = ( a < 0 );
 1207     absA = zSign ? - a : a;
 1208     shiftCount = countLeadingZeros32( absA ) + 17;
 1209     zSig0 = absA;
 1210     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
 1211 
 1212 }
 1213 
 1214 #endif
 1215 
 1216 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
 1217 /*
 1218 -------------------------------------------------------------------------------
 1219 Returns the result of converting the 64-bit two's complement integer `a'
 1220 to the single-precision floating-point format.  The conversion is performed
 1221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1222 -------------------------------------------------------------------------------
 1223 */
 1224 float32 int64_to_float32( int64 a )
 1225 {
 1226     flag zSign;
 1227     uint64 absA;
 1228     int8 shiftCount;
 1229 
 1230     if ( a == 0 ) return 0;
 1231     zSign = ( a < 0 );
 1232     absA = zSign ? - a : a;
 1233     shiftCount = countLeadingZeros64( absA ) - 40;
 1234     if ( 0 <= shiftCount ) {
 1235         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
 1236     }
 1237     else {
 1238         shiftCount += 7;
 1239         if ( shiftCount < 0 ) {
 1240             shift64RightJamming( absA, - shiftCount, &absA );
 1241         }
 1242         else {
 1243             absA <<= shiftCount;
 1244         }
 1245         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
 1246     }
 1247 
 1248 }
 1249 
 1250 /*
 1251 -------------------------------------------------------------------------------
 1252 Returns the result of converting the 64-bit two's complement integer `a'
 1253 to the double-precision floating-point format.  The conversion is performed
 1254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1255 -------------------------------------------------------------------------------
 1256 */
 1257 float64 int64_to_float64( int64 a )
 1258 {
 1259     flag zSign;
 1260 
 1261     if ( a == 0 ) return 0;
 1262     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
 1263         return packFloat64( 1, 0x43E, 0 );
 1264     }
 1265     zSign = ( a < 0 );
 1266     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
 1267 
 1268 }
 1269 
 1270 #ifdef FLOATX80
 1271 
 1272 /*
 1273 -------------------------------------------------------------------------------
 1274 Returns the result of converting the 64-bit two's complement integer `a'
 1275 to the extended double-precision floating-point format.  The conversion
 1276 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 1277 Arithmetic.
 1278 -------------------------------------------------------------------------------
 1279 */
 1280 floatx80 int64_to_floatx80( int64 a )
 1281 {
 1282     flag zSign;
 1283     uint64 absA;
 1284     int8 shiftCount;
 1285 
 1286     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
 1287     zSign = ( a < 0 );
 1288     absA = zSign ? - a : a;
 1289     shiftCount = countLeadingZeros64( absA );
 1290     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
 1291 
 1292 }
 1293 
 1294 #endif
 1295 
 1296 #ifdef FLOAT128
 1297 
 1298 /*
 1299 -------------------------------------------------------------------------------
 1300 Returns the result of converting the 64-bit two's complement integer `a' to
 1301 the quadruple-precision floating-point format.  The conversion is performed
 1302 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1303 -------------------------------------------------------------------------------
 1304 */
 1305 float128 int64_to_float128( int64 a )
 1306 {
 1307     flag zSign;
 1308     uint64 absA;
 1309     int8 shiftCount;
 1310     int32 zExp;
 1311     bits64 zSig0, zSig1;
 1312 
 1313     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
 1314     zSign = ( a < 0 );
 1315     absA = zSign ? - a : a;
 1316     shiftCount = countLeadingZeros64( absA ) + 49;
 1317     zExp = 0x406E - shiftCount;
 1318     if ( 64 <= shiftCount ) {
 1319         zSig1 = 0;
 1320         zSig0 = absA;
 1321         shiftCount -= 64;
 1322     }
 1323     else {
 1324         zSig1 = absA;
 1325         zSig0 = 0;
 1326     }
 1327     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 1328     return packFloat128( zSign, zExp, zSig0, zSig1 );
 1329 
 1330 }
 1331 
 1332 #endif
 1333 #endif /* !SOFTFLOAT_FOR_GCC */
 1334 
 1335 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 1336 /*
 1337 -------------------------------------------------------------------------------
 1338 Returns the result of converting the single-precision floating-point value
 1339 `a' to the 32-bit two's complement integer format.  The conversion is
 1340 performed according to the IEC/IEEE Standard for Binary Floating-Point
 1341 Arithmetic---which means in particular that the conversion is rounded
 1342 according to the current rounding mode.  If `a' is a NaN, the largest
 1343 positive integer is returned.  Otherwise, if the conversion overflows, the
 1344 largest integer with the same sign as `a' is returned.
 1345 -------------------------------------------------------------------------------
 1346 */
 1347 int32 float32_to_int32( float32 a )
 1348 {
 1349     flag aSign;
 1350     int16 aExp, shiftCount;
 1351     bits32 aSig;
 1352     bits64 aSig64;
 1353 
 1354     aSig = extractFloat32Frac( a );
 1355     aExp = extractFloat32Exp( a );
 1356     aSign = extractFloat32Sign( a );
 1357     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
 1358     if ( aExp ) aSig |= 0x00800000;
 1359     shiftCount = 0xAF - aExp;
 1360     aSig64 = aSig;
 1361     aSig64 <<= 32;
 1362     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
 1363     return roundAndPackInt32( aSign, aSig64 );
 1364 
 1365 }
 1366 #endif /* !SOFTFLOAT_FOR_GCC */
 1367 
 1368 /*
 1369 -------------------------------------------------------------------------------
 1370 Returns the result of converting the single-precision floating-point value
 1371 `a' to the 32-bit two's complement integer format.  The conversion is
 1372 performed according to the IEC/IEEE Standard for Binary Floating-Point
 1373 Arithmetic, except that the conversion is always rounded toward zero.
 1374 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
 1375 the conversion overflows, the largest integer with the same sign as `a' is
 1376 returned.
 1377 -------------------------------------------------------------------------------
 1378 */
 1379 int32 float32_to_int32_round_to_zero( float32 a )
 1380 {
 1381     flag aSign;
 1382     int16 aExp, shiftCount;
 1383     bits32 aSig;
 1384     int32 z;
 1385 
 1386     aSig = extractFloat32Frac( a );
 1387     aExp = extractFloat32Exp( a );
 1388     aSign = extractFloat32Sign( a );
 1389     shiftCount = aExp - 0x9E;
 1390     if ( 0 <= shiftCount ) {
 1391         if ( a != 0xCF000000 ) {
 1392             float_raise( float_flag_invalid );
 1393             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
 1394         }
 1395         return (sbits32) 0x80000000;
 1396     }
 1397     else if ( aExp <= 0x7E ) {
 1398         if ( aExp | aSig ) float_set_inexact();
 1399         return 0;
 1400     }
 1401     aSig = ( aSig | 0x00800000 )<<8;
 1402     z = aSig>>( - shiftCount );
 1403     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
 1404         float_set_inexact();
 1405     }
 1406     if ( aSign ) z = - z;
 1407     return z;
 1408 
 1409 }
 1410 
 1411 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
 1412 /*
 1413 -------------------------------------------------------------------------------
 1414 Returns the result of converting the single-precision floating-point value
 1415 `a' to the 64-bit two's complement integer format.  The conversion is
 1416 performed according to the IEC/IEEE Standard for Binary Floating-Point
 1417 Arithmetic---which means in particular that the conversion is rounded
 1418 according to the current rounding mode.  If `a' is a NaN, the largest
 1419 positive integer is returned.  Otherwise, if the conversion overflows, the
 1420 largest integer with the same sign as `a' is returned.
 1421 -------------------------------------------------------------------------------
 1422 */
 1423 int64 float32_to_int64( float32 a )
 1424 {
 1425     flag aSign;
 1426     int16 aExp, shiftCount;
 1427     bits32 aSig;
 1428     bits64 aSig64, aSigExtra;
 1429 
 1430     aSig = extractFloat32Frac( a );
 1431     aExp = extractFloat32Exp( a );
 1432     aSign = extractFloat32Sign( a );
 1433     shiftCount = 0xBE - aExp;
 1434     if ( shiftCount < 0 ) {
 1435         float_raise( float_flag_invalid );
 1436         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
 1437             return LIT64( 0x7FFFFFFFFFFFFFFF );
 1438         }
 1439         return (sbits64) LIT64( 0x8000000000000000 );
 1440     }
 1441     if ( aExp ) aSig |= 0x00800000;
 1442     aSig64 = aSig;
 1443     aSig64 <<= 40;
 1444     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
 1445     return roundAndPackInt64( aSign, aSig64, aSigExtra );
 1446 
 1447 }
 1448 
 1449 /*
 1450 -------------------------------------------------------------------------------
 1451 Returns the result of converting the single-precision floating-point value
 1452 `a' to the 64-bit two's complement integer format.  The conversion is
 1453 performed according to the IEC/IEEE Standard for Binary Floating-Point
 1454 Arithmetic, except that the conversion is always rounded toward zero.  If
 1455 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
 1456 conversion overflows, the largest integer with the same sign as `a' is
 1457 returned.
 1458 -------------------------------------------------------------------------------
 1459 */
 1460 int64 float32_to_int64_round_to_zero( float32 a )
 1461 {
 1462     flag aSign;
 1463     int16 aExp, shiftCount;
 1464     bits32 aSig;
 1465     bits64 aSig64;
 1466     int64 z;
 1467 
 1468     aSig = extractFloat32Frac( a );
 1469     aExp = extractFloat32Exp( a );
 1470     aSign = extractFloat32Sign( a );
 1471     shiftCount = aExp - 0xBE;
 1472     if ( 0 <= shiftCount ) {
 1473         if ( a != 0xDF000000 ) {
 1474             float_raise( float_flag_invalid );
 1475             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
 1476                 return LIT64( 0x7FFFFFFFFFFFFFFF );
 1477             }
 1478         }
 1479         return (sbits64) LIT64( 0x8000000000000000 );
 1480     }
 1481     else if ( aExp <= 0x7E ) {
 1482         if ( aExp | aSig ) float_set_inexact();
 1483         return 0;
 1484     }
 1485     aSig64 = aSig | 0x00800000;
 1486     aSig64 <<= 40;
 1487     z = aSig64>>( - shiftCount );
 1488     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
 1489         float_set_inexact();
 1490     }
 1491     if ( aSign ) z = - z;
 1492     return z;
 1493 
 1494 }
 1495 #endif /* !SOFTFLOAT_FOR_GCC */
 1496 
 1497 /*
 1498 -------------------------------------------------------------------------------
 1499 Returns the result of converting the single-precision floating-point value
 1500 `a' to the double-precision floating-point format.  The conversion is
 1501 performed according to the IEC/IEEE Standard for Binary Floating-Point
 1502 Arithmetic.
 1503 -------------------------------------------------------------------------------
 1504 */
 1505 float64 float32_to_float64( float32 a )
 1506 {
 1507     flag aSign;
 1508     int16 aExp;
 1509     bits32 aSig;
 1510 
 1511     aSig = extractFloat32Frac( a );
 1512     aExp = extractFloat32Exp( a );
 1513     aSign = extractFloat32Sign( a );
 1514     if ( aExp == 0xFF ) {
 1515         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
 1516         return packFloat64( aSign, 0x7FF, 0 );
 1517     }
 1518     if ( aExp == 0 ) {
 1519         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
 1520         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 1521         --aExp;
 1522     }
 1523     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
 1524 
 1525 }
 1526 
 1527 #ifdef FLOATX80
 1528 
 1529 /*
 1530 -------------------------------------------------------------------------------
 1531 Returns the result of converting the single-precision floating-point value
 1532 `a' to the extended double-precision floating-point format.  The conversion
 1533 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 1534 Arithmetic.
 1535 -------------------------------------------------------------------------------
 1536 */
 1537 floatx80 float32_to_floatx80( float32 a )
 1538 {
 1539     flag aSign;
 1540     int16 aExp;
 1541     bits32 aSig;
 1542 
 1543     aSig = extractFloat32Frac( a );
 1544     aExp = extractFloat32Exp( a );
 1545     aSign = extractFloat32Sign( a );
 1546     if ( aExp == 0xFF ) {
 1547         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
 1548         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 1549     }
 1550     if ( aExp == 0 ) {
 1551         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
 1552         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 1553     }
 1554     aSig |= 0x00800000;
 1555     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
 1556 
 1557 }
 1558 
 1559 #endif
 1560 
 1561 #ifdef FLOAT128
 1562 
 1563 /*
 1564 -------------------------------------------------------------------------------
 1565 Returns the result of converting the single-precision floating-point value
 1566 `a' to the double-precision floating-point format.  The conversion is
 1567 performed according to the IEC/IEEE Standard for Binary Floating-Point
 1568 Arithmetic.
 1569 -------------------------------------------------------------------------------
 1570 */
 1571 float128 float32_to_float128( float32 a )
 1572 {
 1573     flag aSign;
 1574     int16 aExp;
 1575     bits32 aSig;
 1576 
 1577     aSig = extractFloat32Frac( a );
 1578     aExp = extractFloat32Exp( a );
 1579     aSign = extractFloat32Sign( a );
 1580     if ( aExp == 0xFF ) {
 1581         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
 1582         return packFloat128( aSign, 0x7FFF, 0, 0 );
 1583     }
 1584     if ( aExp == 0 ) {
 1585         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
 1586         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 1587         --aExp;
 1588     }
 1589     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
 1590 
 1591 }
 1592 
 1593 #endif
 1594 
 1595 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 1596 /*
 1597 -------------------------------------------------------------------------------
 1598 Rounds the single-precision floating-point value `a' to an integer, and
 1599 returns the result as a single-precision floating-point value.  The
 1600 operation is performed according to the IEC/IEEE Standard for Binary
 1601 Floating-Point Arithmetic.
 1602 -------------------------------------------------------------------------------
 1603 */
 1604 float32 float32_round_to_int( float32 a )
 1605 {
 1606     flag aSign;
 1607     int16 aExp;
 1608     bits32 lastBitMask, roundBitsMask;
 1609     int8 roundingMode;
 1610     float32 z;
 1611 
 1612     aExp = extractFloat32Exp( a );
 1613     if ( 0x96 <= aExp ) {
 1614         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
 1615             return propagateFloat32NaN( a, a );
 1616         }
 1617         return a;
 1618     }
 1619     if ( aExp <= 0x7E ) {
 1620         if ( (bits32) ( a<<1 ) == 0 ) return a;
 1621         float_set_inexact();
 1622         aSign = extractFloat32Sign( a );
 1623         switch ( float_rounding_mode() ) {
 1624          case float_round_nearest_even:
 1625             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
 1626                 return packFloat32( aSign, 0x7F, 0 );
 1627             }
 1628             break;
 1629          case float_round_down:
 1630             return aSign ? 0xBF800000 : 0;
 1631          case float_round_up:
 1632             return aSign ? 0x80000000 : 0x3F800000;
 1633         }
 1634         return packFloat32( aSign, 0, 0 );
 1635     }
 1636     lastBitMask = 1;
 1637     lastBitMask <<= 0x96 - aExp;
 1638     roundBitsMask = lastBitMask - 1;
 1639     z = a;
 1640     roundingMode = float_rounding_mode();
 1641     if ( roundingMode == float_round_nearest_even ) {
 1642         z += lastBitMask>>1;
 1643         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
 1644     }
 1645     else if ( roundingMode != float_round_to_zero ) {
 1646         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
 1647             z += roundBitsMask;
 1648         }
 1649     }
 1650     z &= ~ roundBitsMask;
 1651     if ( z != a ) float_set_inexact();
 1652     return z;
 1653 
 1654 }
 1655 #endif /* !SOFTFLOAT_FOR_GCC */
 1656 
 1657 /*
 1658 -------------------------------------------------------------------------------
 1659 Returns the result of adding the absolute values of the single-precision
 1660 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
 1661 before being returned.  `zSign' is ignored if the result is a NaN.
 1662 The addition is performed according to the IEC/IEEE Standard for Binary
 1663 Floating-Point Arithmetic.
 1664 -------------------------------------------------------------------------------
 1665 */
 1666 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
 1667 {
 1668     int16 aExp, bExp, zExp;
 1669     bits32 aSig, bSig, zSig;
 1670     int16 expDiff;
 1671 
 1672     aSig = extractFloat32Frac( a );
 1673     aExp = extractFloat32Exp( a );
 1674     bSig = extractFloat32Frac( b );
 1675     bExp = extractFloat32Exp( b );
 1676     expDiff = aExp - bExp;
 1677     aSig <<= 6;
 1678     bSig <<= 6;
 1679     if ( 0 < expDiff ) {
 1680         if ( aExp == 0xFF ) {
 1681             if ( aSig ) return propagateFloat32NaN( a, b );
 1682             return a;
 1683         }
 1684         if ( bExp == 0 ) {
 1685             --expDiff;
 1686         }
 1687         else {
 1688             bSig |= 0x20000000;
 1689         }
 1690         shift32RightJamming( bSig, expDiff, &bSig );
 1691         zExp = aExp;
 1692     }
 1693     else if ( expDiff < 0 ) {
 1694         if ( bExp == 0xFF ) {
 1695             if ( bSig ) return propagateFloat32NaN( a, b );
 1696             return packFloat32( zSign, 0xFF, 0 );
 1697         }
 1698         if ( aExp == 0 ) {
 1699             ++expDiff;
 1700         }
 1701         else {
 1702             aSig |= 0x20000000;
 1703         }
 1704         shift32RightJamming( aSig, - expDiff, &aSig );
 1705         zExp = bExp;
 1706     }
 1707     else {
 1708         if ( aExp == 0xFF ) {
 1709             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
 1710             return a;
 1711         }
 1712         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
 1713         zSig = 0x40000000 + aSig + bSig;
 1714         zExp = aExp;
 1715         goto roundAndPack;
 1716     }
 1717     aSig |= 0x20000000;
 1718     zSig = ( aSig + bSig )<<1;
 1719     --zExp;
 1720     if ( (sbits32) zSig < 0 ) {
 1721         zSig = aSig + bSig;
 1722         ++zExp;
 1723     }
 1724  roundAndPack:
 1725     return roundAndPackFloat32( zSign, zExp, zSig );
 1726 
 1727 }
 1728 
 1729 /*
 1730 -------------------------------------------------------------------------------
 1731 Returns the result of subtracting the absolute values of the single-
 1732 precision floating-point values `a' and `b'.  If `zSign' is 1, the
 1733 difference is negated before being returned.  `zSign' is ignored if the
 1734 result is a NaN.  The subtraction is performed according to the IEC/IEEE
 1735 Standard for Binary Floating-Point Arithmetic.
 1736 -------------------------------------------------------------------------------
 1737 */
 1738 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
 1739 {
 1740     int16 aExp, bExp, zExp;
 1741     bits32 aSig, bSig, zSig;
 1742     int16 expDiff;
 1743 
 1744     aSig = extractFloat32Frac( a );
 1745     aExp = extractFloat32Exp( a );
 1746     bSig = extractFloat32Frac( b );
 1747     bExp = extractFloat32Exp( b );
 1748     expDiff = aExp - bExp;
 1749     aSig <<= 7;
 1750     bSig <<= 7;
 1751     if ( 0 < expDiff ) goto aExpBigger;
 1752     if ( expDiff < 0 ) goto bExpBigger;
 1753     if ( aExp == 0xFF ) {
 1754         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
 1755         float_raise( float_flag_invalid );
 1756         return float32_default_nan;
 1757     }
 1758     if ( aExp == 0 ) {
 1759         aExp = 1;
 1760         bExp = 1;
 1761     }
 1762     if ( bSig < aSig ) goto aBigger;
 1763     if ( aSig < bSig ) goto bBigger;
 1764     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
 1765  bExpBigger:
 1766     if ( bExp == 0xFF ) {
 1767         if ( bSig ) return propagateFloat32NaN( a, b );
 1768         return packFloat32( zSign ^ 1, 0xFF, 0 );
 1769     }
 1770     if ( aExp == 0 ) {
 1771         ++expDiff;
 1772     }
 1773     else {
 1774         aSig |= 0x40000000;
 1775     }
 1776     shift32RightJamming( aSig, - expDiff, &aSig );
 1777     bSig |= 0x40000000;
 1778  bBigger:
 1779     zSig = bSig - aSig;
 1780     zExp = bExp;
 1781     zSign ^= 1;
 1782     goto normalizeRoundAndPack;
 1783  aExpBigger:
 1784     if ( aExp == 0xFF ) {
 1785         if ( aSig ) return propagateFloat32NaN( a, b );
 1786         return a;
 1787     }
 1788     if ( bExp == 0 ) {
 1789         --expDiff;
 1790     }
 1791     else {
 1792         bSig |= 0x40000000;
 1793     }
 1794     shift32RightJamming( bSig, expDiff, &bSig );
 1795     aSig |= 0x40000000;
 1796  aBigger:
 1797     zSig = aSig - bSig;
 1798     zExp = aExp;
 1799  normalizeRoundAndPack:
 1800     --zExp;
 1801     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
 1802 
 1803 }
 1804 
 1805 /*
 1806 -------------------------------------------------------------------------------
 1807 Returns the result of adding the single-precision floating-point values `a'
 1808 and `b'.  The operation is performed according to the IEC/IEEE Standard for
 1809 Binary Floating-Point Arithmetic.
 1810 -------------------------------------------------------------------------------
 1811 */
 1812 float32 float32_add( float32 a, float32 b )
 1813 {
 1814     flag aSign, bSign;
 1815 
 1816     aSign = extractFloat32Sign( a );
 1817     bSign = extractFloat32Sign( b );
 1818     if ( aSign == bSign ) {
 1819         return addFloat32Sigs( a, b, aSign );
 1820     }
 1821     else {
 1822         return subFloat32Sigs( a, b, aSign );
 1823     }
 1824 
 1825 }
 1826 
 1827 /*
 1828 -------------------------------------------------------------------------------
 1829 Returns the result of subtracting the single-precision floating-point values
 1830 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
 1831 for Binary Floating-Point Arithmetic.
 1832 -------------------------------------------------------------------------------
 1833 */
 1834 float32 float32_sub( float32 a, float32 b )
 1835 {
 1836     flag aSign, bSign;
 1837 
 1838     aSign = extractFloat32Sign( a );
 1839     bSign = extractFloat32Sign( b );
 1840     if ( aSign == bSign ) {
 1841         return subFloat32Sigs( a, b, aSign );
 1842     }
 1843     else {
 1844         return addFloat32Sigs( a, b, aSign );
 1845     }
 1846 
 1847 }
 1848 
 1849 /*
 1850 -------------------------------------------------------------------------------
 1851 Returns the result of multiplying the single-precision floating-point values
 1852 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
 1853 for Binary Floating-Point Arithmetic.
 1854 -------------------------------------------------------------------------------
 1855 */
 1856 float32 float32_mul( float32 a, float32 b )
 1857 {
 1858     flag aSign, bSign, zSign;
 1859     int16 aExp, bExp, zExp;
 1860     bits32 aSig, bSig;
 1861     bits64 zSig64;
 1862     bits32 zSig;
 1863 
 1864     aSig = extractFloat32Frac( a );
 1865     aExp = extractFloat32Exp( a );
 1866     aSign = extractFloat32Sign( a );
 1867     bSig = extractFloat32Frac( b );
 1868     bExp = extractFloat32Exp( b );
 1869     bSign = extractFloat32Sign( b );
 1870     zSign = aSign ^ bSign;
 1871     if ( aExp == 0xFF ) {
 1872         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
 1873             return propagateFloat32NaN( a, b );
 1874         }
 1875         if ( ( bExp | bSig ) == 0 ) {
 1876             float_raise( float_flag_invalid );
 1877             return float32_default_nan;
 1878         }
 1879         return packFloat32( zSign, 0xFF, 0 );
 1880     }
 1881     if ( bExp == 0xFF ) {
 1882         if ( bSig ) return propagateFloat32NaN( a, b );
 1883         if ( ( aExp | aSig ) == 0 ) {
 1884             float_raise( float_flag_invalid );
 1885             return float32_default_nan;
 1886         }
 1887         return packFloat32( zSign, 0xFF, 0 );
 1888     }
 1889     if ( aExp == 0 ) {
 1890         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
 1891         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 1892     }
 1893     if ( bExp == 0 ) {
 1894         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
 1895         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
 1896     }
 1897     zExp = aExp + bExp - 0x7F;
 1898     aSig = ( aSig | 0x00800000 )<<7;
 1899     bSig = ( bSig | 0x00800000 )<<8;
 1900     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
 1901     zSig = zSig64;
 1902     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
 1903         zSig <<= 1;
 1904         --zExp;
 1905     }
 1906     return roundAndPackFloat32( zSign, zExp, zSig );
 1907 
 1908 }
 1909 
 1910 /*
 1911 -------------------------------------------------------------------------------
 1912 Returns the result of dividing the single-precision floating-point value `a'
 1913 by the corresponding value `b'.  The operation is performed according to the
 1914 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1915 -------------------------------------------------------------------------------
 1916 */
 1917 float32 float32_div( float32 a, float32 b )
 1918 {
 1919     flag aSign, bSign, zSign;
 1920     int16 aExp, bExp, zExp;
 1921     bits32 aSig, bSig, zSig;
 1922 
 1923     aSig = extractFloat32Frac( a );
 1924     aExp = extractFloat32Exp( a );
 1925     aSign = extractFloat32Sign( a );
 1926     bSig = extractFloat32Frac( b );
 1927     bExp = extractFloat32Exp( b );
 1928     bSign = extractFloat32Sign( b );
 1929     zSign = aSign ^ bSign;
 1930     if ( aExp == 0xFF ) {
 1931         if ( aSig ) return propagateFloat32NaN( a, b );
 1932         if ( bExp == 0xFF ) {
 1933             if ( bSig ) return propagateFloat32NaN( a, b );
 1934             float_raise( float_flag_invalid );
 1935             return float32_default_nan;
 1936         }
 1937         return packFloat32( zSign, 0xFF, 0 );
 1938     }
 1939     if ( bExp == 0xFF ) {
 1940         if ( bSig ) return propagateFloat32NaN( a, b );
 1941         return packFloat32( zSign, 0, 0 );
 1942     }
 1943     if ( bExp == 0 ) {
 1944         if ( bSig == 0 ) {
 1945             if ( ( aExp | aSig ) == 0 ) {
 1946                 float_raise( float_flag_invalid );
 1947                 return float32_default_nan;
 1948             }
 1949             float_raise( float_flag_divbyzero );
 1950             return packFloat32( zSign, 0xFF, 0 );
 1951         }
 1952         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
 1953     }
 1954     if ( aExp == 0 ) {
 1955         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
 1956         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 1957     }
 1958     zExp = aExp - bExp + 0x7D;
 1959     aSig = ( aSig | 0x00800000 )<<7;
 1960     bSig = ( bSig | 0x00800000 )<<8;
 1961     if ( bSig <= ( aSig + aSig ) ) {
 1962         aSig >>= 1;
 1963         ++zExp;
 1964     }
 1965     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
 1966     if ( ( zSig & 0x3F ) == 0 ) {
 1967         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
 1968     }
 1969     return roundAndPackFloat32( zSign, zExp, zSig );
 1970 
 1971 }
 1972 
 1973 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 1974 /*
 1975 -------------------------------------------------------------------------------
 1976 Returns the remainder of the single-precision floating-point value `a'
 1977 with respect to the corresponding value `b'.  The operation is performed
 1978 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 1979 -------------------------------------------------------------------------------
 1980 */
 1981 float32 float32_rem( float32 a, float32 b )
 1982 {
 1983     flag aSign, bSign, zSign;
 1984     int16 aExp, bExp, expDiff;
 1985     bits32 aSig, bSig;
 1986     bits32 q;
 1987     bits64 aSig64, bSig64, q64;
 1988     bits32 alternateASig;
 1989     sbits32 sigMean;
 1990 
 1991     aSig = extractFloat32Frac( a );
 1992     aExp = extractFloat32Exp( a );
 1993     aSign = extractFloat32Sign( a );
 1994     bSig = extractFloat32Frac( b );
 1995     bExp = extractFloat32Exp( b );
 1996     bSign = extractFloat32Sign( b );
 1997     if ( aExp == 0xFF ) {
 1998         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
 1999             return propagateFloat32NaN( a, b );
 2000         }
 2001         float_raise( float_flag_invalid );
 2002         return float32_default_nan;
 2003     }
 2004     if ( bExp == 0xFF ) {
 2005         if ( bSig ) return propagateFloat32NaN( a, b );
 2006         return a;
 2007     }
 2008     if ( bExp == 0 ) {
 2009         if ( bSig == 0 ) {
 2010             float_raise( float_flag_invalid );
 2011             return float32_default_nan;
 2012         }
 2013         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
 2014     }
 2015     if ( aExp == 0 ) {
 2016         if ( aSig == 0 ) return a;
 2017         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 2018     }
 2019     expDiff = aExp - bExp;
 2020     aSig |= 0x00800000;
 2021     bSig |= 0x00800000;
 2022     if ( expDiff < 32 ) {
 2023         aSig <<= 8;
 2024         bSig <<= 8;
 2025         if ( expDiff < 0 ) {
 2026             if ( expDiff < -1 ) return a;
 2027             aSig >>= 1;
 2028         }
 2029         q = ( bSig <= aSig );
 2030         if ( q ) aSig -= bSig;
 2031         if ( 0 < expDiff ) {
 2032             q = ( ( (bits64) aSig )<<32 ) / bSig;
 2033             q >>= 32 - expDiff;
 2034             bSig >>= 2;
 2035             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
 2036         }
 2037         else {
 2038             aSig >>= 2;
 2039             bSig >>= 2;
 2040         }
 2041     }
 2042     else {
 2043         if ( bSig <= aSig ) aSig -= bSig;
 2044         aSig64 = ( (bits64) aSig )<<40;
 2045         bSig64 = ( (bits64) bSig )<<40;
 2046         expDiff -= 64;
 2047         while ( 0 < expDiff ) {
 2048             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
 2049             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
 2050             aSig64 = - ( ( bSig * q64 )<<38 );
 2051             expDiff -= 62;
 2052         }
 2053         expDiff += 64;
 2054         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
 2055         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
 2056         q = q64>>( 64 - expDiff );
 2057         bSig <<= 6;
 2058         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
 2059     }
 2060     do {
 2061         alternateASig = aSig;
 2062         ++q;
 2063         aSig -= bSig;
 2064     } while ( 0 <= (sbits32) aSig );
 2065     sigMean = aSig + alternateASig;
 2066     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
 2067         aSig = alternateASig;
 2068     }
 2069     zSign = ( (sbits32) aSig < 0 );
 2070     if ( zSign ) aSig = - aSig;
 2071     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
 2072 
 2073 }
 2074 #endif /* !SOFTFLOAT_FOR_GCC */
 2075 
 2076 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 2077 /*
 2078 -------------------------------------------------------------------------------
 2079 Returns the square root of the single-precision floating-point value `a'.
 2080 The operation is performed according to the IEC/IEEE Standard for Binary
 2081 Floating-Point Arithmetic.
 2082 -------------------------------------------------------------------------------
 2083 */
 2084 float32 float32_sqrt( float32 a )
 2085 {
 2086     flag aSign;
 2087     int16 aExp, zExp;
 2088     bits32 aSig, zSig;
 2089     bits64 rem, term;
 2090 
 2091     aSig = extractFloat32Frac( a );
 2092     aExp = extractFloat32Exp( a );
 2093     aSign = extractFloat32Sign( a );
 2094     if ( aExp == 0xFF ) {
 2095         if ( aSig ) return propagateFloat32NaN( a, 0 );
 2096         if ( ! aSign ) return a;
 2097         float_raise( float_flag_invalid );
 2098         return float32_default_nan;
 2099     }
 2100     if ( aSign ) {
 2101         if ( ( aExp | aSig ) == 0 ) return a;
 2102         float_raise( float_flag_invalid );
 2103         return float32_default_nan;
 2104     }
 2105     if ( aExp == 0 ) {
 2106         if ( aSig == 0 ) return 0;
 2107         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 2108     }
 2109     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
 2110     aSig = ( aSig | 0x00800000 )<<8;
 2111     zSig = estimateSqrt32( aExp, aSig ) + 2;
 2112     if ( ( zSig & 0x7F ) <= 5 ) {
 2113         if ( zSig < 2 ) {
 2114             zSig = 0x7FFFFFFF;
 2115             goto roundAndPack;
 2116         }
 2117         aSig >>= aExp & 1;
 2118         term = ( (bits64) zSig ) * zSig;
 2119         rem = ( ( (bits64) aSig )<<32 ) - term;
 2120         while ( (sbits64) rem < 0 ) {
 2121             --zSig;
 2122             rem += ( ( (bits64) zSig )<<1 ) | 1;
 2123         }
 2124         zSig |= ( rem != 0 );
 2125     }
 2126     shift32RightJamming( zSig, 1, &zSig );
 2127  roundAndPack:
 2128     return roundAndPackFloat32( 0, zExp, zSig );
 2129 
 2130 }
 2131 #endif /* !SOFTFLOAT_FOR_GCC */
 2132 
 2133 /*
 2134 -------------------------------------------------------------------------------
 2135 Returns 1 if the single-precision floating-point value `a' is equal to
 2136 the corresponding value `b', and 0 otherwise.  The comparison is performed
 2137 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 2138 -------------------------------------------------------------------------------
 2139 */
 2140 flag float32_eq( float32 a, float32 b )
 2141 {
 2142 
 2143     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
 2144          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
 2145        ) {
 2146         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
 2147             float_raise( float_flag_invalid );
 2148         }
 2149         return 0;
 2150     }
 2151     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
 2152 
 2153 }
 2154 
 2155 /*
 2156 -------------------------------------------------------------------------------
 2157 Returns 1 if the single-precision floating-point value `a' is less than
 2158 or equal to the corresponding value `b', and 0 otherwise.  The comparison
 2159 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 2160 Arithmetic.
 2161 -------------------------------------------------------------------------------
 2162 */
 2163 flag float32_le( float32 a, float32 b )
 2164 {
 2165     flag aSign, bSign;
 2166 
 2167     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
 2168          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
 2169        ) {
 2170         float_raise( float_flag_invalid );
 2171         return 0;
 2172     }
 2173     aSign = extractFloat32Sign( a );
 2174     bSign = extractFloat32Sign( b );
 2175     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
 2176     return ( a == b ) || ( aSign ^ ( a < b ) );
 2177 
 2178 }
 2179 
 2180 /*
 2181 -------------------------------------------------------------------------------
 2182 Returns 1 if the single-precision floating-point value `a' is less than
 2183 the corresponding value `b', and 0 otherwise.  The comparison is performed
 2184 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 2185 -------------------------------------------------------------------------------
 2186 */
 2187 flag float32_lt( float32 a, float32 b )
 2188 {
 2189     flag aSign, bSign;
 2190 
 2191     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
 2192          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
 2193        ) {
 2194         float_raise( float_flag_invalid );
 2195         return 0;
 2196     }
 2197     aSign = extractFloat32Sign( a );
 2198     bSign = extractFloat32Sign( b );
 2199     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
 2200     return ( a != b ) && ( aSign ^ ( a < b ) );
 2201 
 2202 }
 2203 
 2204 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 2205 /*
 2206 -------------------------------------------------------------------------------
 2207 Returns 1 if the single-precision floating-point value `a' is equal to
 2208 the corresponding value `b', and 0 otherwise.  The invalid exception is
 2209 raised if either operand is a NaN.  Otherwise, the comparison is performed
 2210 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 2211 -------------------------------------------------------------------------------
 2212 */
 2213 flag float32_eq_signaling( float32 a, float32 b )
 2214 {
 2215 
 2216     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
 2217          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
 2218        ) {
 2219         float_raise( float_flag_invalid );
 2220         return 0;
 2221     }
 2222     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
 2223 
 2224 }
 2225 
 2226 /*
 2227 -------------------------------------------------------------------------------
 2228 Returns 1 if the single-precision floating-point value `a' is less than or
 2229 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
 2230 cause an exception.  Otherwise, the comparison is performed according to the
 2231 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 2232 -------------------------------------------------------------------------------
 2233 */
 2234 flag float32_le_quiet( float32 a, float32 b )
 2235 {
 2236     flag aSign, bSign;
 2237 
 2238     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
 2239          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
 2240        ) {
 2241         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
 2242             float_raise( float_flag_invalid );
 2243         }
 2244         return 0;
 2245     }
 2246     aSign = extractFloat32Sign( a );
 2247     bSign = extractFloat32Sign( b );
 2248     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
 2249     return ( a == b ) || ( aSign ^ ( a < b ) );
 2250 
 2251 }
 2252 
 2253 /*
 2254 -------------------------------------------------------------------------------
 2255 Returns 1 if the single-precision floating-point value `a' is less than
 2256 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
 2257 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
 2258 Standard for Binary Floating-Point Arithmetic.
 2259 -------------------------------------------------------------------------------
 2260 */
 2261 flag float32_lt_quiet( float32 a, float32 b )
 2262 {
 2263     flag aSign, bSign;
 2264 
 2265     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
 2266          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
 2267        ) {
 2268         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
 2269             float_raise( float_flag_invalid );
 2270         }
 2271         return 0;
 2272     }
 2273     aSign = extractFloat32Sign( a );
 2274     bSign = extractFloat32Sign( b );
 2275     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
 2276     return ( a != b ) && ( aSign ^ ( a < b ) );
 2277 
 2278 }
 2279 #endif /* !SOFTFLOAT_FOR_GCC */
 2280 
 2281 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 2282 /*
 2283 -------------------------------------------------------------------------------
 2284 Returns the result of converting the double-precision floating-point value
 2285 `a' to the 32-bit two's complement integer format.  The conversion is
 2286 performed according to the IEC/IEEE Standard for Binary Floating-Point
 2287 Arithmetic---which means in particular that the conversion is rounded
 2288 according to the current rounding mode.  If `a' is a NaN, the largest
 2289 positive integer is returned.  Otherwise, if the conversion overflows, the
 2290 largest integer with the same sign as `a' is returned.
 2291 -------------------------------------------------------------------------------
 2292 */
 2293 int32 float64_to_int32( float64 a )
 2294 {
 2295     flag aSign;
 2296     int16 aExp, shiftCount;
 2297     bits64 aSig;
 2298 
 2299     aSig = extractFloat64Frac( a );
 2300     aExp = extractFloat64Exp( a );
 2301     aSign = extractFloat64Sign( a );
 2302     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
 2303     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
 2304     shiftCount = 0x42C - aExp;
 2305     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
 2306     return roundAndPackInt32( aSign, aSig );
 2307 
 2308 }
 2309 #endif /* !SOFTFLOAT_FOR_GCC */
 2310 
 2311 /*
 2312 -------------------------------------------------------------------------------
 2313 Returns the result of converting the double-precision floating-point value
 2314 `a' to the 32-bit two's complement integer format.  The conversion is
 2315 performed according to the IEC/IEEE Standard for Binary Floating-Point
 2316 Arithmetic, except that the conversion is always rounded toward zero.
 2317 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
 2318 the conversion overflows, the largest integer with the same sign as `a' is
 2319 returned.
 2320 -------------------------------------------------------------------------------
 2321 */
 2322 int32 float64_to_int32_round_to_zero( float64 a )
 2323 {
 2324     flag aSign;
 2325     int16 aExp, shiftCount;
 2326     bits64 aSig, savedASig;
 2327     int32 z;
 2328 
 2329     aSig = extractFloat64Frac( a );
 2330     aExp = extractFloat64Exp( a );
 2331     aSign = extractFloat64Sign( a );
 2332     if ( 0x41E < aExp ) {
 2333         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
 2334         goto invalid;
 2335     }
 2336     else if ( aExp < 0x3FF ) {
 2337         if ( aExp || aSig ) float_set_inexact();
 2338         return 0;
 2339     }
 2340     aSig |= LIT64( 0x0010000000000000 );
 2341     shiftCount = 0x433 - aExp;
 2342     savedASig = aSig;
 2343     aSig >>= shiftCount;
 2344     z = aSig;
 2345     if ( aSign ) z = - z;
 2346     if ( ( z < 0 ) ^ aSign ) {
 2347  invalid:
 2348         float_raise( float_flag_invalid );
 2349         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 2350     }
 2351     if ( ( aSig<<shiftCount ) != savedASig ) {
 2352         float_set_inexact();
 2353     }
 2354     return z;
 2355 
 2356 }
 2357 
 2358 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
 2359 /*
 2360 -------------------------------------------------------------------------------
 2361 Returns the result of converting the double-precision floating-point value
 2362 `a' to the 64-bit two's complement integer format.  The conversion is
 2363 performed according to the IEC/IEEE Standard for Binary Floating-Point
 2364 Arithmetic---which means in particular that the conversion is rounded
 2365 according to the current rounding mode.  If `a' is a NaN, the largest
 2366 positive integer is returned.  Otherwise, if the conversion overflows, the
 2367 largest integer with the same sign as `a' is returned.
 2368 -------------------------------------------------------------------------------
 2369 */
 2370 int64 float64_to_int64( float64 a )
 2371 {
 2372     flag aSign;
 2373     int16 aExp, shiftCount;
 2374     bits64 aSig, aSigExtra;
 2375 
 2376     aSig = extractFloat64Frac( a );
 2377     aExp = extractFloat64Exp( a );
 2378     aSign = extractFloat64Sign( a );
 2379     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
 2380     shiftCount = 0x433 - aExp;
 2381     if ( shiftCount <= 0 ) {
 2382         if ( 0x43E < aExp ) {
 2383             float_raise( float_flag_invalid );
 2384             if (    ! aSign
 2385                  || (    ( aExp == 0x7FF )
 2386                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
 2387                ) {
 2388                 return LIT64( 0x7FFFFFFFFFFFFFFF );
 2389             }
 2390             return (sbits64) LIT64( 0x8000000000000000 );
 2391         }
 2392         aSigExtra = 0;
 2393         aSig <<= - shiftCount;
 2394     }
 2395     else {
 2396         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
 2397     }
 2398     return roundAndPackInt64( aSign, aSig, aSigExtra );
 2399 
 2400 }
 2401 
 2402 /*
 2403 -------------------------------------------------------------------------------
 2404 Returns the result of converting the double-precision floating-point value
 2405 `a' to the 64-bit two's complement integer format.  The conversion is
 2406 performed according to the IEC/IEEE Standard for Binary Floating-Point
 2407 Arithmetic, except that the conversion is always rounded toward zero.
 2408 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
 2409 the conversion overflows, the largest integer with the same sign as `a' is
 2410 returned.
 2411 -------------------------------------------------------------------------------
 2412 */
 2413 int64 float64_to_int64_round_to_zero( float64 a )
 2414 {
 2415     flag aSign;
 2416     int16 aExp, shiftCount;
 2417     bits64 aSig;
 2418     int64 z;
 2419 
 2420     aSig = extractFloat64Frac( a );
 2421     aExp = extractFloat64Exp( a );
 2422     aSign = extractFloat64Sign( a );
 2423     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
 2424     shiftCount = aExp - 0x433;
 2425     if ( 0 <= shiftCount ) {
 2426         if ( 0x43E <= aExp ) {
 2427             if ( a != LIT64( 0xC3E0000000000000 ) ) {
 2428                 float_raise( float_flag_invalid );
 2429                 if (    ! aSign
 2430                      || (    ( aExp == 0x7FF )
 2431                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
 2432                    ) {
 2433                     return LIT64( 0x7FFFFFFFFFFFFFFF );
 2434                 }
 2435             }
 2436             return (sbits64) LIT64( 0x8000000000000000 );
 2437         }
 2438         z = aSig<<shiftCount;
 2439     }
 2440     else {
 2441         if ( aExp < 0x3FE ) {
 2442             if ( aExp | aSig ) float_set_inexact();
 2443             return 0;
 2444         }
 2445         z = aSig>>( - shiftCount );
 2446         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
 2447             float_set_inexact();
 2448         }
 2449     }
 2450     if ( aSign ) z = - z;
 2451     return z;
 2452 
 2453 }
 2454 #endif /* !SOFTFLOAT_FOR_GCC */
 2455 
 2456 /*
 2457 -------------------------------------------------------------------------------
 2458 Returns the result of converting the double-precision floating-point value
 2459 `a' to the single-precision floating-point format.  The conversion is
 2460 performed according to the IEC/IEEE Standard for Binary Floating-Point
 2461 Arithmetic.
 2462 -------------------------------------------------------------------------------
 2463 */
 2464 float32 float64_to_float32( float64 a )
 2465 {
 2466     flag aSign;
 2467     int16 aExp;
 2468     bits64 aSig;
 2469     bits32 zSig;
 2470 
 2471     aSig = extractFloat64Frac( a );
 2472     aExp = extractFloat64Exp( a );
 2473     aSign = extractFloat64Sign( a );
 2474     if ( aExp == 0x7FF ) {
 2475         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
 2476         return packFloat32( aSign, 0xFF, 0 );
 2477     }
 2478     shift64RightJamming( aSig, 22, &aSig );
 2479     zSig = aSig;
 2480     if ( aExp || zSig ) {
 2481         zSig |= 0x40000000;
 2482         aExp -= 0x381;
 2483     }
 2484     return roundAndPackFloat32( aSign, aExp, zSig );
 2485 
 2486 }
 2487 
 2488 #ifdef FLOATX80
 2489 
 2490 /*
 2491 -------------------------------------------------------------------------------
 2492 Returns the result of converting the double-precision floating-point value
 2493 `a' to the extended double-precision floating-point format.  The conversion
 2494 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 2495 Arithmetic.
 2496 -------------------------------------------------------------------------------
 2497 */
 2498 floatx80 float64_to_floatx80( float64 a )
 2499 {
 2500     flag aSign;
 2501     int16 aExp;
 2502     bits64 aSig;
 2503 
 2504     aSig = extractFloat64Frac( a );
 2505     aExp = extractFloat64Exp( a );
 2506     aSign = extractFloat64Sign( a );
 2507     if ( aExp == 0x7FF ) {
 2508         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
 2509         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 2510     }
 2511     if ( aExp == 0 ) {
 2512         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
 2513         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 2514     }
 2515     return
 2516         packFloatx80(
 2517             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
 2518 
 2519 }
 2520 
 2521 #endif
 2522 
 2523 #ifdef FLOAT128
 2524 
 2525 /*
 2526 -------------------------------------------------------------------------------
 2527 Returns the result of converting the double-precision floating-point value
 2528 `a' to the quadruple-precision floating-point format.  The conversion is
 2529 performed according to the IEC/IEEE Standard for Binary Floating-Point
 2530 Arithmetic.
 2531 -------------------------------------------------------------------------------
 2532 */
 2533 float128 float64_to_float128( float64 a )
 2534 {
 2535     flag aSign;
 2536     int16 aExp;
 2537     bits64 aSig, zSig0, zSig1;
 2538 
 2539     aSig = extractFloat64Frac( a );
 2540     aExp = extractFloat64Exp( a );
 2541     aSign = extractFloat64Sign( a );
 2542     if ( aExp == 0x7FF ) {
 2543         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
 2544         return packFloat128( aSign, 0x7FFF, 0, 0 );
 2545     }
 2546     if ( aExp == 0 ) {
 2547         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
 2548         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 2549         --aExp;
 2550     }
 2551     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
 2552     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
 2553 
 2554 }
 2555 
 2556 #endif
 2557 
 2558 #ifndef SOFTFLOAT_FOR_GCC
 2559 /*
 2560 -------------------------------------------------------------------------------
 2561 Rounds the double-precision floating-point value `a' to an integer, and
 2562 returns the result as a double-precision floating-point value.  The
 2563 operation is performed according to the IEC/IEEE Standard for Binary
 2564 Floating-Point Arithmetic.
 2565 -------------------------------------------------------------------------------
 2566 */
 2567 float64 float64_round_to_int( float64 a )
 2568 {
 2569     flag aSign;
 2570     int16 aExp;
 2571     bits64 lastBitMask, roundBitsMask;
 2572     int8 roundingMode;
 2573     float64 z;
 2574 
 2575     aExp = extractFloat64Exp( a );
 2576     if ( 0x433 <= aExp ) {
 2577         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
 2578             return propagateFloat64NaN( a, a );
 2579         }
 2580         return a;
 2581     }
 2582     if ( aExp < 0x3FF ) {
 2583         if ( (bits64) ( a<<1 ) == 0 ) return a;
 2584         float_set_inexact();
 2585         aSign = extractFloat64Sign( a );
 2586         switch ( float_rounding_mode() ) {
 2587          case float_round_nearest_even:
 2588             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
 2589                 return packFloat64( aSign, 0x3FF, 0 );
 2590             }
 2591             break;
 2592          case float_round_down:
 2593             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
 2594          case float_round_up:
 2595             return
 2596             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
 2597         }
 2598         return packFloat64( aSign, 0, 0 );
 2599     }
 2600     lastBitMask = 1;
 2601     lastBitMask <<= 0x433 - aExp;
 2602     roundBitsMask = lastBitMask - 1;
 2603     z = a;
 2604     roundingMode = float_rounding_mode();
 2605     if ( roundingMode == float_round_nearest_even ) {
 2606         z += lastBitMask>>1;
 2607         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
 2608     }
 2609     else if ( roundingMode != float_round_to_zero ) {
 2610         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
 2611             z += roundBitsMask;
 2612         }
 2613     }
 2614     z &= ~ roundBitsMask;
 2615     if ( z != a ) float_set_inexact();
 2616     return z;
 2617 
 2618 }
 2619 #endif
 2620 
 2621 /*
 2622 -------------------------------------------------------------------------------
 2623 Returns the result of adding the absolute values of the double-precision
 2624 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
 2625 before being returned.  `zSign' is ignored if the result is a NaN.
 2626 The addition is performed according to the IEC/IEEE Standard for Binary
 2627 Floating-Point Arithmetic.
 2628 -------------------------------------------------------------------------------
 2629 */
 2630 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
 2631 {
 2632     int16 aExp, bExp, zExp;
 2633     bits64 aSig, bSig, zSig;
 2634     int16 expDiff;
 2635 
 2636     aSig = extractFloat64Frac( a );
 2637     aExp = extractFloat64Exp( a );
 2638     bSig = extractFloat64Frac( b );
 2639     bExp = extractFloat64Exp( b );
 2640     expDiff = aExp - bExp;
 2641     aSig <<= 9;
 2642     bSig <<= 9;
 2643     if ( 0 < expDiff ) {
 2644         if ( aExp == 0x7FF ) {
 2645             if ( aSig ) return propagateFloat64NaN( a, b );
 2646             return a;
 2647         }
 2648         if ( bExp == 0 ) {
 2649             --expDiff;
 2650         }
 2651         else {
 2652             bSig |= LIT64( 0x2000000000000000 );
 2653         }
 2654         shift64RightJamming( bSig, expDiff, &bSig );
 2655         zExp = aExp;
 2656     }
 2657     else if ( expDiff < 0 ) {
 2658         if ( bExp == 0x7FF ) {
 2659             if ( bSig ) return propagateFloat64NaN( a, b );
 2660             return packFloat64( zSign, 0x7FF, 0 );
 2661         }
 2662         if ( aExp == 0 ) {
 2663             ++expDiff;
 2664         }
 2665         else {
 2666             aSig |= LIT64( 0x2000000000000000 );
 2667         }
 2668         shift64RightJamming( aSig, - expDiff, &aSig );
 2669         zExp = bExp;
 2670     }
 2671     else {
 2672         if ( aExp == 0x7FF ) {
 2673             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
 2674             return a;
 2675         }
 2676         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
 2677         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
 2678         zExp = aExp;
 2679         goto roundAndPack;
 2680     }
 2681     aSig |= LIT64( 0x2000000000000000 );
 2682     zSig = ( aSig + bSig )<<1;
 2683     --zExp;
 2684     if ( (sbits64) zSig < 0 ) {
 2685         zSig = aSig + bSig;
 2686         ++zExp;
 2687     }
 2688  roundAndPack:
 2689     return roundAndPackFloat64( zSign, zExp, zSig );
 2690 
 2691 }
 2692 
 2693 /*
 2694 -------------------------------------------------------------------------------
 2695 Returns the result of subtracting the absolute values of the double-
 2696 precision floating-point values `a' and `b'.  If `zSign' is 1, the
 2697 difference is negated before being returned.  `zSign' is ignored if the
 2698 result is a NaN.  The subtraction is performed according to the IEC/IEEE
 2699 Standard for Binary Floating-Point Arithmetic.
 2700 -------------------------------------------------------------------------------
 2701 */
 2702 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
 2703 {
 2704     int16 aExp, bExp, zExp;
 2705     bits64 aSig, bSig, zSig;
 2706     int16 expDiff;
 2707 
 2708     aSig = extractFloat64Frac( a );
 2709     aExp = extractFloat64Exp( a );
 2710     bSig = extractFloat64Frac( b );
 2711     bExp = extractFloat64Exp( b );
 2712     expDiff = aExp - bExp;
 2713     aSig <<= 10;
 2714     bSig <<= 10;
 2715     if ( 0 < expDiff ) goto aExpBigger;
 2716     if ( expDiff < 0 ) goto bExpBigger;
 2717     if ( aExp == 0x7FF ) {
 2718         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
 2719         float_raise( float_flag_invalid );
 2720         return float64_default_nan;
 2721     }
 2722     if ( aExp == 0 ) {
 2723         aExp = 1;
 2724         bExp = 1;
 2725     }
 2726     if ( bSig < aSig ) goto aBigger;
 2727     if ( aSig < bSig ) goto bBigger;
 2728     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
 2729  bExpBigger:
 2730     if ( bExp == 0x7FF ) {
 2731         if ( bSig ) return propagateFloat64NaN( a, b );
 2732         return packFloat64( zSign ^ 1, 0x7FF, 0 );
 2733     }
 2734     if ( aExp == 0 ) {
 2735         ++expDiff;
 2736     }
 2737     else {
 2738         aSig |= LIT64( 0x4000000000000000 );
 2739     }
 2740     shift64RightJamming( aSig, - expDiff, &aSig );
 2741     bSig |= LIT64( 0x4000000000000000 );
 2742  bBigger:
 2743     zSig = bSig - aSig;
 2744     zExp = bExp;
 2745     zSign ^= 1;
 2746     goto normalizeRoundAndPack;
 2747  aExpBigger:
 2748     if ( aExp == 0x7FF ) {
 2749         if ( aSig ) return propagateFloat64NaN( a, b );
 2750         return a;
 2751     }
 2752     if ( bExp == 0 ) {
 2753         --expDiff;
 2754     }
 2755     else {
 2756         bSig |= LIT64( 0x4000000000000000 );
 2757     }
 2758     shift64RightJamming( bSig, expDiff, &bSig );
 2759     aSig |= LIT64( 0x4000000000000000 );
 2760  aBigger:
 2761     zSig = aSig - bSig;
 2762     zExp = aExp;
 2763  normalizeRoundAndPack:
 2764     --zExp;
 2765     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
 2766 
 2767 }
 2768 
 2769 /*
 2770 -------------------------------------------------------------------------------
 2771 Returns the result of adding the double-precision floating-point values `a'
 2772 and `b'.  The operation is performed according to the IEC/IEEE Standard for
 2773 Binary Floating-Point Arithmetic.
 2774 -------------------------------------------------------------------------------
 2775 */
 2776 float64 float64_add( float64 a, float64 b )
 2777 {
 2778     flag aSign, bSign;
 2779 
 2780     aSign = extractFloat64Sign( a );
 2781     bSign = extractFloat64Sign( b );
 2782     if ( aSign == bSign ) {
 2783         return addFloat64Sigs( a, b, aSign );
 2784     }
 2785     else {
 2786         return subFloat64Sigs( a, b, aSign );
 2787     }
 2788 
 2789 }
 2790 
 2791 /*
 2792 -------------------------------------------------------------------------------
 2793 Returns the result of subtracting the double-precision floating-point values
 2794 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
 2795 for Binary Floating-Point Arithmetic.
 2796 -------------------------------------------------------------------------------
 2797 */
 2798 float64 float64_sub( float64 a, float64 b )
 2799 {
 2800     flag aSign, bSign;
 2801 
 2802     aSign = extractFloat64Sign( a );
 2803     bSign = extractFloat64Sign( b );
 2804     if ( aSign == bSign ) {
 2805         return subFloat64Sigs( a, b, aSign );
 2806     }
 2807     else {
 2808         return addFloat64Sigs( a, b, aSign );
 2809     }
 2810 
 2811 }
 2812 
 2813 /*
 2814 -------------------------------------------------------------------------------
 2815 Returns the result of multiplying the double-precision floating-point values
 2816 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
 2817 for Binary Floating-Point Arithmetic.
 2818 -------------------------------------------------------------------------------
 2819 */
 2820 float64 float64_mul( float64 a, float64 b )
 2821 {
 2822     flag aSign, bSign, zSign;
 2823     int16 aExp, bExp, zExp;
 2824     bits64 aSig, bSig, zSig0, zSig1;
 2825 
 2826     aSig = extractFloat64Frac( a );
 2827     aExp = extractFloat64Exp( a );
 2828     aSign = extractFloat64Sign( a );
 2829     bSig = extractFloat64Frac( b );
 2830     bExp = extractFloat64Exp( b );
 2831     bSign = extractFloat64Sign( b );
 2832     zSign = aSign ^ bSign;
 2833     if ( aExp == 0x7FF ) {
 2834         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
 2835             return propagateFloat64NaN( a, b );
 2836         }
 2837         if ( ( bExp | bSig ) == 0 ) {
 2838             float_raise( float_flag_invalid );
 2839             return float64_default_nan;
 2840         }
 2841         return packFloat64( zSign, 0x7FF, 0 );
 2842     }
 2843     if ( bExp == 0x7FF ) {
 2844         if ( bSig ) return propagateFloat64NaN( a, b );
 2845         if ( ( aExp | aSig ) == 0 ) {
 2846             float_raise( float_flag_invalid );
 2847             return float64_default_nan;
 2848         }
 2849         return packFloat64( zSign, 0x7FF, 0 );
 2850     }
 2851     if ( aExp == 0 ) {
 2852         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
 2853         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 2854     }
 2855     if ( bExp == 0 ) {
 2856         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
 2857         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
 2858     }
 2859     zExp = aExp + bExp - 0x3FF;
 2860     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
 2861     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
 2862     mul64To128( aSig, bSig, &zSig0, &zSig1 );
 2863     zSig0 |= ( zSig1 != 0 );
 2864     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
 2865         zSig0 <<= 1;
 2866         --zExp;
 2867     }
 2868     return roundAndPackFloat64( zSign, zExp, zSig0 );
 2869 
 2870 }
 2871 
 2872 /*
 2873 -------------------------------------------------------------------------------
 2874 Returns the result of dividing the double-precision floating-point value `a'
 2875 by the corresponding value `b'.  The operation is performed according to
 2876 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 2877 -------------------------------------------------------------------------------
 2878 */
 2879 float64 float64_div( float64 a, float64 b )
 2880 {
 2881     flag aSign, bSign, zSign;
 2882     int16 aExp, bExp, zExp;
 2883     bits64 aSig, bSig, zSig;
 2884     bits64 rem0, rem1;
 2885     bits64 term0, term1;
 2886 
 2887     aSig = extractFloat64Frac( a );
 2888     aExp = extractFloat64Exp( a );
 2889     aSign = extractFloat64Sign( a );
 2890     bSig = extractFloat64Frac( b );
 2891     bExp = extractFloat64Exp( b );
 2892     bSign = extractFloat64Sign( b );
 2893     zSign = aSign ^ bSign;
 2894     if ( aExp == 0x7FF ) {
 2895         if ( aSig ) return propagateFloat64NaN( a, b );
 2896         if ( bExp == 0x7FF ) {
 2897             if ( bSig ) return propagateFloat64NaN( a, b );
 2898             float_raise( float_flag_invalid );
 2899             return float64_default_nan;
 2900         }
 2901         return packFloat64( zSign, 0x7FF, 0 );
 2902     }
 2903     if ( bExp == 0x7FF ) {
 2904         if ( bSig ) return propagateFloat64NaN( a, b );
 2905         return packFloat64( zSign, 0, 0 );
 2906     }
 2907     if ( bExp == 0 ) {
 2908         if ( bSig == 0 ) {
 2909             if ( ( aExp | aSig ) == 0 ) {
 2910                 float_raise( float_flag_invalid );
 2911                 return float64_default_nan;
 2912             }
 2913             float_raise( float_flag_divbyzero );
 2914             return packFloat64( zSign, 0x7FF, 0 );
 2915         }
 2916         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
 2917     }
 2918     if ( aExp == 0 ) {
 2919         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
 2920         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 2921     }
 2922     zExp = aExp - bExp + 0x3FD;
 2923     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
 2924     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
 2925     if ( bSig <= ( aSig + aSig ) ) {
 2926         aSig >>= 1;
 2927         ++zExp;
 2928     }
 2929     zSig = estimateDiv128To64( aSig, 0, bSig );
 2930     if ( ( zSig & 0x1FF ) <= 2 ) {
 2931         mul64To128( bSig, zSig, &term0, &term1 );
 2932         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
 2933         while ( (sbits64) rem0 < 0 ) {
 2934             --zSig;
 2935             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
 2936         }
 2937         zSig |= ( rem1 != 0 );
 2938     }
 2939     return roundAndPackFloat64( zSign, zExp, zSig );
 2940 
 2941 }
 2942 
 2943 #ifndef SOFTFLOAT_FOR_GCC
 2944 /*
 2945 -------------------------------------------------------------------------------
 2946 Returns the remainder of the double-precision floating-point value `a'
 2947 with respect to the corresponding value `b'.  The operation is performed
 2948 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 2949 -------------------------------------------------------------------------------
 2950 */
 2951 float64 float64_rem( float64 a, float64 b )
 2952 {
 2953     flag aSign, bSign, zSign;
 2954     int16 aExp, bExp, expDiff;
 2955     bits64 aSig, bSig;
 2956     bits64 q, alternateASig;
 2957     sbits64 sigMean;
 2958 
 2959     aSig = extractFloat64Frac( a );
 2960     aExp = extractFloat64Exp( a );
 2961     aSign = extractFloat64Sign( a );
 2962     bSig = extractFloat64Frac( b );
 2963     bExp = extractFloat64Exp( b );
 2964     bSign = extractFloat64Sign( b );
 2965     if ( aExp == 0x7FF ) {
 2966         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
 2967             return propagateFloat64NaN( a, b );
 2968         }
 2969         float_raise( float_flag_invalid );
 2970         return float64_default_nan;
 2971     }
 2972     if ( bExp == 0x7FF ) {
 2973         if ( bSig ) return propagateFloat64NaN( a, b );
 2974         return a;
 2975     }
 2976     if ( bExp == 0 ) {
 2977         if ( bSig == 0 ) {
 2978             float_raise( float_flag_invalid );
 2979             return float64_default_nan;
 2980         }
 2981         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
 2982     }
 2983     if ( aExp == 0 ) {
 2984         if ( aSig == 0 ) return a;
 2985         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 2986     }
 2987     expDiff = aExp - bExp;
 2988     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
 2989     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
 2990     if ( expDiff < 0 ) {
 2991         if ( expDiff < -1 ) return a;
 2992         aSig >>= 1;
 2993     }
 2994     q = ( bSig <= aSig );
 2995     if ( q ) aSig -= bSig;
 2996     expDiff -= 64;
 2997     while ( 0 < expDiff ) {
 2998         q = estimateDiv128To64( aSig, 0, bSig );
 2999         q = ( 2 < q ) ? q - 2 : 0;
 3000         aSig = - ( ( bSig>>2 ) * q );
 3001         expDiff -= 62;
 3002     }
 3003     expDiff += 64;
 3004     if ( 0 < expDiff ) {
 3005         q = estimateDiv128To64( aSig, 0, bSig );
 3006         q = ( 2 < q ) ? q - 2 : 0;
 3007         q >>= 64 - expDiff;
 3008         bSig >>= 2;
 3009         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
 3010     }
 3011     else {
 3012         aSig >>= 2;
 3013         bSig >>= 2;
 3014     }
 3015     do {
 3016         alternateASig = aSig;
 3017         ++q;
 3018         aSig -= bSig;
 3019     } while ( 0 <= (sbits64) aSig );
 3020     sigMean = aSig + alternateASig;
 3021     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
 3022         aSig = alternateASig;
 3023     }
 3024     zSign = ( (sbits64) aSig < 0 );
 3025     if ( zSign ) aSig = - aSig;
 3026     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
 3027 
 3028 }
 3029 
 3030 /*
 3031 -------------------------------------------------------------------------------
 3032 Returns the square root of the double-precision floating-point value `a'.
 3033 The operation is performed according to the IEC/IEEE Standard for Binary
 3034 Floating-Point Arithmetic.
 3035 -------------------------------------------------------------------------------
 3036 */
 3037 float64 float64_sqrt( float64 a )
 3038 {
 3039     flag aSign;
 3040     int16 aExp, zExp;
 3041     bits64 aSig, zSig, doubleZSig;
 3042     bits64 rem0, rem1, term0, term1;
 3043 
 3044     aSig = extractFloat64Frac( a );
 3045     aExp = extractFloat64Exp( a );
 3046     aSign = extractFloat64Sign( a );
 3047     if ( aExp == 0x7FF ) {
 3048         if ( aSig ) return propagateFloat64NaN( a, a );
 3049         if ( ! aSign ) return a;
 3050         float_raise( float_flag_invalid );
 3051         return float64_default_nan;
 3052     }
 3053     if ( aSign ) {
 3054         if ( ( aExp | aSig ) == 0 ) return a;
 3055         float_raise( float_flag_invalid );
 3056         return float64_default_nan;
 3057     }
 3058     if ( aExp == 0 ) {
 3059         if ( aSig == 0 ) return 0;
 3060         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 3061     }
 3062     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
 3063     aSig |= LIT64( 0x0010000000000000 );
 3064     zSig = estimateSqrt32( aExp, aSig>>21 );
 3065     aSig <<= 9 - ( aExp & 1 );
 3066     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
 3067     if ( ( zSig & 0x1FF ) <= 5 ) {
 3068         doubleZSig = zSig<<1;
 3069         mul64To128( zSig, zSig, &term0, &term1 );
 3070         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
 3071         while ( (sbits64) rem0 < 0 ) {
 3072             --zSig;
 3073             doubleZSig -= 2;
 3074             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
 3075         }
 3076         zSig |= ( ( rem0 | rem1 ) != 0 );
 3077     }
 3078     return roundAndPackFloat64( 0, zExp, zSig );
 3079 
 3080 }
 3081 #endif
 3082 
 3083 /*
 3084 -------------------------------------------------------------------------------
 3085 Returns 1 if the double-precision floating-point value `a' is equal to the
 3086 corresponding value `b', and 0 otherwise.  The comparison is performed
 3087 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3088 -------------------------------------------------------------------------------
 3089 */
 3090 flag float64_eq( float64 a, float64 b )
 3091 {
 3092 
 3093     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
 3094          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
 3095        ) {
 3096         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
 3097             float_raise( float_flag_invalid );
 3098         }
 3099         return 0;
 3100     }
 3101     return ( a == b ) ||
 3102         ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
 3103 
 3104 }
 3105 
 3106 /*
 3107 -------------------------------------------------------------------------------
 3108 Returns 1 if the double-precision floating-point value `a' is less than or
 3109 equal to the corresponding value `b', and 0 otherwise.  The comparison is
 3110 performed according to the IEC/IEEE Standard for Binary Floating-Point
 3111 Arithmetic.
 3112 -------------------------------------------------------------------------------
 3113 */
 3114 flag float64_le( float64 a, float64 b )
 3115 {
 3116     flag aSign, bSign;
 3117 
 3118     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
 3119          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
 3120        ) {
 3121         float_raise( float_flag_invalid );
 3122         return 0;
 3123     }
 3124     aSign = extractFloat64Sign( a );
 3125     bSign = extractFloat64Sign( b );
 3126     if ( aSign != bSign )
 3127         return aSign ||
 3128             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
 3129               0 );
 3130     return ( a == b ) ||
 3131         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
 3132 
 3133 }
 3134 
 3135 /*
 3136 -------------------------------------------------------------------------------
 3137 Returns 1 if the double-precision floating-point value `a' is less than
 3138 the corresponding value `b', and 0 otherwise.  The comparison is performed
 3139 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3140 -------------------------------------------------------------------------------
 3141 */
 3142 flag float64_lt( float64 a, float64 b )
 3143 {
 3144     flag aSign, bSign;
 3145 
 3146     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
 3147          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
 3148        ) {
 3149         float_raise( float_flag_invalid );
 3150         return 0;
 3151     }
 3152     aSign = extractFloat64Sign( a );
 3153     bSign = extractFloat64Sign( b );
 3154     if ( aSign != bSign )
 3155         return aSign &&
 3156             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
 3157               0 );
 3158     return ( a != b ) &&
 3159         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
 3160 
 3161 }
 3162 
 3163 #ifndef SOFTFLOAT_FOR_GCC
 3164 /*
 3165 -------------------------------------------------------------------------------
 3166 Returns 1 if the double-precision floating-point value `a' is equal to the
 3167 corresponding value `b', and 0 otherwise.  The invalid exception is raised
 3168 if either operand is a NaN.  Otherwise, the comparison is performed
 3169 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3170 -------------------------------------------------------------------------------
 3171 */
 3172 flag float64_eq_signaling( float64 a, float64 b )
 3173 {
 3174 
 3175     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
 3176          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
 3177        ) {
 3178         float_raise( float_flag_invalid );
 3179         return 0;
 3180     }
 3181     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
 3182 
 3183 }
 3184 
 3185 /*
 3186 -------------------------------------------------------------------------------
 3187 Returns 1 if the double-precision floating-point value `a' is less than or
 3188 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
 3189 cause an exception.  Otherwise, the comparison is performed according to the
 3190 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3191 -------------------------------------------------------------------------------
 3192 */
 3193 flag float64_le_quiet( float64 a, float64 b )
 3194 {
 3195     flag aSign, bSign;
 3196 
 3197     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
 3198          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
 3199        ) {
 3200         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
 3201             float_raise( float_flag_invalid );
 3202         }
 3203         return 0;
 3204     }
 3205     aSign = extractFloat64Sign( a );
 3206     bSign = extractFloat64Sign( b );
 3207     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
 3208     return ( a == b ) || ( aSign ^ ( a < b ) );
 3209 
 3210 }
 3211 
 3212 /*
 3213 -------------------------------------------------------------------------------
 3214 Returns 1 if the double-precision floating-point value `a' is less than
 3215 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
 3216 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
 3217 Standard for Binary Floating-Point Arithmetic.
 3218 -------------------------------------------------------------------------------
 3219 */
 3220 flag float64_lt_quiet( float64 a, float64 b )
 3221 {
 3222     flag aSign, bSign;
 3223 
 3224     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
 3225          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
 3226        ) {
 3227         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
 3228             float_raise( float_flag_invalid );
 3229         }
 3230         return 0;
 3231     }
 3232     aSign = extractFloat64Sign( a );
 3233     bSign = extractFloat64Sign( b );
 3234     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
 3235     return ( a != b ) && ( aSign ^ ( a < b ) );
 3236 
 3237 }
 3238 #endif
 3239 
 3240 #ifdef FLOATX80
 3241 
 3242 /*
 3243 -------------------------------------------------------------------------------
 3244 Returns the result of converting the extended double-precision floating-
 3245 point value `a' to the 32-bit two's complement integer format.  The
 3246 conversion is performed according to the IEC/IEEE Standard for Binary
 3247 Floating-Point Arithmetic---which means in particular that the conversion
 3248 is rounded according to the current rounding mode.  If `a' is a NaN, the
 3249 largest positive integer is returned.  Otherwise, if the conversion
 3250 overflows, the largest integer with the same sign as `a' is returned.
 3251 -------------------------------------------------------------------------------
 3252 */
 3253 int32 floatx80_to_int32( floatx80 a )
 3254 {
 3255     flag aSign;
 3256     int32 aExp, shiftCount;
 3257     bits64 aSig;
 3258 
 3259     aSig = extractFloatx80Frac( a );
 3260     aExp = extractFloatx80Exp( a );
 3261     aSign = extractFloatx80Sign( a );
 3262     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
 3263     shiftCount = 0x4037 - aExp;
 3264     if ( shiftCount <= 0 ) shiftCount = 1;
 3265     shift64RightJamming( aSig, shiftCount, &aSig );
 3266     return roundAndPackInt32( aSign, aSig );
 3267 
 3268 }
 3269 
 3270 /*
 3271 -------------------------------------------------------------------------------
 3272 Returns the result of converting the extended double-precision floating-
 3273 point value `a' to the 32-bit two's complement integer format.  The
 3274 conversion is performed according to the IEC/IEEE Standard for Binary
 3275 Floating-Point Arithmetic, except that the conversion is always rounded
 3276 toward zero.  If `a' is a NaN, the largest positive integer is returned.
 3277 Otherwise, if the conversion overflows, the largest integer with the same
 3278 sign as `a' is returned.
 3279 -------------------------------------------------------------------------------
 3280 */
 3281 int32 floatx80_to_int32_round_to_zero( floatx80 a )
 3282 {
 3283     flag aSign;
 3284     int32 aExp, shiftCount;
 3285     bits64 aSig, savedASig;
 3286     int32 z;
 3287 
 3288     aSig = extractFloatx80Frac( a );
 3289     aExp = extractFloatx80Exp( a );
 3290     aSign = extractFloatx80Sign( a );
 3291     if ( 0x401E < aExp ) {
 3292         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
 3293         goto invalid;
 3294     }
 3295     else if ( aExp < 0x3FFF ) {
 3296         if ( aExp || aSig ) float_set_inexact();
 3297         return 0;
 3298     }
 3299     shiftCount = 0x403E - aExp;
 3300     savedASig = aSig;
 3301     aSig >>= shiftCount;
 3302     z = aSig;
 3303     if ( aSign ) z = - z;
 3304     if ( ( z < 0 ) ^ aSign ) {
 3305  invalid:
 3306         float_raise( float_flag_invalid );
 3307         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 3308     }
 3309     if ( ( aSig<<shiftCount ) != savedASig ) {
 3310         float_set_inexact();
 3311     }
 3312     return z;
 3313 
 3314 }
 3315 
 3316 /*
 3317 -------------------------------------------------------------------------------
 3318 Returns the result of converting the extended double-precision floating-
 3319 point value `a' to the 64-bit two's complement integer format.  The
 3320 conversion is performed according to the IEC/IEEE Standard for Binary
 3321 Floating-Point Arithmetic---which means in particular that the conversion
 3322 is rounded according to the current rounding mode.  If `a' is a NaN,
 3323 the largest positive integer is returned.  Otherwise, if the conversion
 3324 overflows, the largest integer with the same sign as `a' is returned.
 3325 -------------------------------------------------------------------------------
 3326 */
 3327 int64 floatx80_to_int64( floatx80 a )
 3328 {
 3329     flag aSign;
 3330     int32 aExp, shiftCount;
 3331     bits64 aSig, aSigExtra;
 3332 
 3333     aSig = extractFloatx80Frac( a );
 3334     aExp = extractFloatx80Exp( a );
 3335     aSign = extractFloatx80Sign( a );
 3336     shiftCount = 0x403E - aExp;
 3337     if ( shiftCount <= 0 ) {
 3338         if ( shiftCount ) {
 3339             float_raise( float_flag_invalid );
 3340             if (    ! aSign
 3341                  || (    ( aExp == 0x7FFF )
 3342                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
 3343                ) {
 3344                 return LIT64( 0x7FFFFFFFFFFFFFFF );
 3345             }
 3346             return (sbits64) LIT64( 0x8000000000000000 );
 3347         }
 3348         aSigExtra = 0;
 3349     }
 3350     else {
 3351         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
 3352     }
 3353     return roundAndPackInt64( aSign, aSig, aSigExtra );
 3354 
 3355 }
 3356 
 3357 /*
 3358 -------------------------------------------------------------------------------
 3359 Returns the result of converting the extended double-precision floating-
 3360 point value `a' to the 64-bit two's complement integer format.  The
 3361 conversion is performed according to the IEC/IEEE Standard for Binary
 3362 Floating-Point Arithmetic, except that the conversion is always rounded
 3363 toward zero.  If `a' is a NaN, the largest positive integer is returned.
 3364 Otherwise, if the conversion overflows, the largest integer with the same
 3365 sign as `a' is returned.
 3366 -------------------------------------------------------------------------------
 3367 */
 3368 int64 floatx80_to_int64_round_to_zero( floatx80 a )
 3369 {
 3370     flag aSign;
 3371     int32 aExp, shiftCount;
 3372     bits64 aSig;
 3373     int64 z;
 3374 
 3375     aSig = extractFloatx80Frac( a );
 3376     aExp = extractFloatx80Exp( a );
 3377     aSign = extractFloatx80Sign( a );
 3378     shiftCount = aExp - 0x403E;
 3379     if ( 0 <= shiftCount ) {
 3380         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
 3381         if ( ( a.high != 0xC03E ) || aSig ) {
 3382             float_raise( float_flag_invalid );
 3383             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
 3384                 return LIT64( 0x7FFFFFFFFFFFFFFF );
 3385             }
 3386         }
 3387         return (sbits64) LIT64( 0x8000000000000000 );
 3388     }
 3389     else if ( aExp < 0x3FFF ) {
 3390         if ( aExp | aSig ) float_set_inexact();
 3391         return 0;
 3392     }
 3393     z = aSig>>( - shiftCount );
 3394     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
 3395         float_set_inexact();
 3396     }
 3397     if ( aSign ) z = - z;
 3398     return z;
 3399 
 3400 }
 3401 
 3402 /*
 3403 -------------------------------------------------------------------------------
 3404 Returns the result of converting the extended double-precision floating-
 3405 point value `a' to the single-precision floating-point format.  The
 3406 conversion is performed according to the IEC/IEEE Standard for Binary
 3407 Floating-Point Arithmetic.
 3408 -------------------------------------------------------------------------------
 3409 */
 3410 float32 floatx80_to_float32( floatx80 a )
 3411 {
 3412     flag aSign;
 3413     int32 aExp;
 3414     bits64 aSig;
 3415 
 3416     aSig = extractFloatx80Frac( a );
 3417     aExp = extractFloatx80Exp( a );
 3418     aSign = extractFloatx80Sign( a );
 3419     if ( aExp == 0x7FFF ) {
 3420         if ( (bits64) ( aSig<<1 ) ) {
 3421             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
 3422         }
 3423         return packFloat32( aSign, 0xFF, 0 );
 3424     }
 3425     shift64RightJamming( aSig, 33, &aSig );
 3426     if ( aExp || aSig ) aExp -= 0x3F81;
 3427     return roundAndPackFloat32( aSign, aExp, aSig );
 3428 
 3429 }
 3430 
 3431 /*
 3432 -------------------------------------------------------------------------------
 3433 Returns the result of converting the extended double-precision floating-
 3434 point value `a' to the double-precision floating-point format.  The
 3435 conversion is performed according to the IEC/IEEE Standard for Binary
 3436 Floating-Point Arithmetic.
 3437 -------------------------------------------------------------------------------
 3438 */
 3439 float64 floatx80_to_float64( floatx80 a )
 3440 {
 3441     flag aSign;
 3442     int32 aExp;
 3443     bits64 aSig, zSig;
 3444 
 3445     aSig = extractFloatx80Frac( a );
 3446     aExp = extractFloatx80Exp( a );
 3447     aSign = extractFloatx80Sign( a );
 3448     if ( aExp == 0x7FFF ) {
 3449         if ( (bits64) ( aSig<<1 ) ) {
 3450             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
 3451         }
 3452         return packFloat64( aSign, 0x7FF, 0 );
 3453     }
 3454     shift64RightJamming( aSig, 1, &zSig );
 3455     if ( aExp || aSig ) aExp -= 0x3C01;
 3456     return roundAndPackFloat64( aSign, aExp, zSig );
 3457 
 3458 }
 3459 
 3460 #ifdef FLOAT128
 3461 
 3462 /*
 3463 -------------------------------------------------------------------------------
 3464 Returns the result of converting the extended double-precision floating-
 3465 point value `a' to the quadruple-precision floating-point format.  The
 3466 conversion is performed according to the IEC/IEEE Standard for Binary
 3467 Floating-Point Arithmetic.
 3468 -------------------------------------------------------------------------------
 3469 */
 3470 float128 floatx80_to_float128( floatx80 a )
 3471 {
 3472     flag aSign;
 3473     int16 aExp;
 3474     bits64 aSig, zSig0, zSig1;
 3475 
 3476     aSig = extractFloatx80Frac( a );
 3477     aExp = extractFloatx80Exp( a );
 3478     aSign = extractFloatx80Sign( a );
 3479     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
 3480         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
 3481     }
 3482     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
 3483     return packFloat128( aSign, aExp, zSig0, zSig1 );
 3484 
 3485 }
 3486 
 3487 #endif
 3488 
 3489 /*
 3490 -------------------------------------------------------------------------------
 3491 Rounds the extended double-precision floating-point value `a' to an integer,
 3492 and returns the result as an extended quadruple-precision floating-point
 3493 value.  The operation is performed according to the IEC/IEEE Standard for
 3494 Binary Floating-Point Arithmetic.
 3495 -------------------------------------------------------------------------------
 3496 */
 3497 floatx80 floatx80_round_to_int( floatx80 a )
 3498 {
 3499     flag aSign;
 3500     int32 aExp;
 3501     bits64 lastBitMask, roundBitsMask;
 3502     int8 roundingMode;
 3503     floatx80 z;
 3504 
 3505     aExp = extractFloatx80Exp( a );
 3506     if ( 0x403E <= aExp ) {
 3507         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
 3508             return propagateFloatx80NaN( a, a );
 3509         }
 3510         return a;
 3511     }
 3512     if ( aExp < 0x3FFF ) {
 3513         if (    ( aExp == 0 )
 3514              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
 3515             return a;
 3516         }
 3517         float_set_inexact();
 3518         aSign = extractFloatx80Sign( a );
 3519         switch ( float_rounding_mode() ) {
 3520          case float_round_nearest_even:
 3521             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
 3522                ) {
 3523                 return
 3524                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
 3525             }
 3526             break;
 3527          case float_round_down:
 3528             return
 3529                   aSign ?
 3530                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
 3531                 : packFloatx80( 0, 0, 0 );
 3532          case float_round_up:
 3533             return
 3534                   aSign ? packFloatx80( 1, 0, 0 )
 3535                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
 3536         }
 3537         return packFloatx80( aSign, 0, 0 );
 3538     }
 3539     lastBitMask = 1;
 3540     lastBitMask <<= 0x403E - aExp;
 3541     roundBitsMask = lastBitMask - 1;
 3542     z = a;
 3543     roundingMode = float_rounding_mode();
 3544     if ( roundingMode == float_round_nearest_even ) {
 3545         z.low += lastBitMask>>1;
 3546         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
 3547     }
 3548     else if ( roundingMode != float_round_to_zero ) {
 3549         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
 3550             z.low += roundBitsMask;
 3551         }
 3552     }
 3553     z.low &= ~ roundBitsMask;
 3554     if ( z.low == 0 ) {
 3555         ++z.high;
 3556         z.low = LIT64( 0x8000000000000000 );
 3557     }
 3558     if ( z.low != a.low ) float_set_inexact();
 3559     return z;
 3560 
 3561 }
 3562 
 3563 /*
 3564 -------------------------------------------------------------------------------
 3565 Returns the result of adding the absolute values of the extended double-
 3566 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
 3567 negated before being returned.  `zSign' is ignored if the result is a NaN.
 3568 The addition is performed according to the IEC/IEEE Standard for Binary
 3569 Floating-Point Arithmetic.
 3570 -------------------------------------------------------------------------------
 3571 */
 3572 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
 3573 {
 3574     int32 aExp, bExp, zExp;
 3575     bits64 aSig, bSig, zSig0, zSig1;
 3576     int32 expDiff;
 3577 
 3578     aSig = extractFloatx80Frac( a );
 3579     aExp = extractFloatx80Exp( a );
 3580     bSig = extractFloatx80Frac( b );
 3581     bExp = extractFloatx80Exp( b );
 3582     expDiff = aExp - bExp;
 3583     if ( 0 < expDiff ) {
 3584         if ( aExp == 0x7FFF ) {
 3585             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
 3586             return a;
 3587         }
 3588         if ( bExp == 0 ) --expDiff;
 3589         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 3590         zExp = aExp;
 3591     }
 3592     else if ( expDiff < 0 ) {
 3593         if ( bExp == 0x7FFF ) {
 3594             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
 3595             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 3596         }
 3597         if ( aExp == 0 ) ++expDiff;
 3598         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 3599         zExp = bExp;
 3600     }
 3601     else {
 3602         if ( aExp == 0x7FFF ) {
 3603             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
 3604                 return propagateFloatx80NaN( a, b );
 3605             }
 3606             return a;
 3607         }
 3608         zSig1 = 0;
 3609         zSig0 = aSig + bSig;
 3610         if ( aExp == 0 ) {
 3611             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
 3612             goto roundAndPack;
 3613         }
 3614         zExp = aExp;
 3615         goto shiftRight1;
 3616     }
 3617     zSig0 = aSig + bSig;
 3618     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
 3619  shiftRight1:
 3620     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
 3621     zSig0 |= LIT64( 0x8000000000000000 );
 3622     ++zExp;
 3623  roundAndPack:
 3624     return
 3625         roundAndPackFloatx80(
 3626             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
 3627 
 3628 }
 3629 
 3630 /*
 3631 -------------------------------------------------------------------------------
 3632 Returns the result of subtracting the absolute values of the extended
 3633 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
 3634 difference is negated before being returned.  `zSign' is ignored if the
 3635 result is a NaN.  The subtraction is performed according to the IEC/IEEE
 3636 Standard for Binary Floating-Point Arithmetic.
 3637 -------------------------------------------------------------------------------
 3638 */
 3639 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
 3640 {
 3641     int32 aExp, bExp, zExp;
 3642     bits64 aSig, bSig, zSig0, zSig1;
 3643     int32 expDiff;
 3644     floatx80 z;
 3645 
 3646     aSig = extractFloatx80Frac( a );
 3647     aExp = extractFloatx80Exp( a );
 3648     bSig = extractFloatx80Frac( b );
 3649     bExp = extractFloatx80Exp( b );
 3650     expDiff = aExp - bExp;
 3651     if ( 0 < expDiff ) goto aExpBigger;
 3652     if ( expDiff < 0 ) goto bExpBigger;
 3653     if ( aExp == 0x7FFF ) {
 3654         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
 3655             return propagateFloatx80NaN( a, b );
 3656         }
 3657         float_raise( float_flag_invalid );
 3658         z.low = floatx80_default_nan_low;
 3659         z.high = floatx80_default_nan_high;
 3660         return z;
 3661     }
 3662     if ( aExp == 0 ) {
 3663         aExp = 1;
 3664         bExp = 1;
 3665     }
 3666     zSig1 = 0;
 3667     if ( bSig < aSig ) goto aBigger;
 3668     if ( aSig < bSig ) goto bBigger;
 3669     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
 3670  bExpBigger:
 3671     if ( bExp == 0x7FFF ) {
 3672         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
 3673         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
 3674     }
 3675     if ( aExp == 0 ) ++expDiff;
 3676     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 3677  bBigger:
 3678     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
 3679     zExp = bExp;
 3680     zSign ^= 1;
 3681     goto normalizeRoundAndPack;
 3682  aExpBigger:
 3683     if ( aExp == 0x7FFF ) {
 3684         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
 3685         return a;
 3686     }
 3687     if ( bExp == 0 ) --expDiff;
 3688     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 3689  aBigger:
 3690     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
 3691     zExp = aExp;
 3692  normalizeRoundAndPack:
 3693     return
 3694         normalizeRoundAndPackFloatx80(
 3695             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
 3696 
 3697 }
 3698 
 3699 /*
 3700 -------------------------------------------------------------------------------
 3701 Returns the result of adding the extended double-precision floating-point
 3702 values `a' and `b'.  The operation is performed according to the IEC/IEEE
 3703 Standard for Binary Floating-Point Arithmetic.
 3704 -------------------------------------------------------------------------------
 3705 */
 3706 floatx80 floatx80_add( floatx80 a, floatx80 b )
 3707 {
 3708     flag aSign, bSign;
 3709 
 3710     aSign = extractFloatx80Sign( a );
 3711     bSign = extractFloatx80Sign( b );
 3712     if ( aSign == bSign ) {
 3713         return addFloatx80Sigs( a, b, aSign );
 3714     }
 3715     else {
 3716         return subFloatx80Sigs( a, b, aSign );
 3717     }
 3718 
 3719 }
 3720 
 3721 /*
 3722 -------------------------------------------------------------------------------
 3723 Returns the result of subtracting the extended double-precision floating-
 3724 point values `a' and `b'.  The operation is performed according to the
 3725 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3726 -------------------------------------------------------------------------------
 3727 */
 3728 floatx80 floatx80_sub( floatx80 a, floatx80 b )
 3729 {
 3730     flag aSign, bSign;
 3731 
 3732     aSign = extractFloatx80Sign( a );
 3733     bSign = extractFloatx80Sign( b );
 3734     if ( aSign == bSign ) {
 3735         return subFloatx80Sigs( a, b, aSign );
 3736     }
 3737     else {
 3738         return addFloatx80Sigs( a, b, aSign );
 3739     }
 3740 
 3741 }
 3742 
 3743 /*
 3744 -------------------------------------------------------------------------------
 3745 Returns the result of multiplying the extended double-precision floating-
 3746 point values `a' and `b'.  The operation is performed according to the
 3747 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3748 -------------------------------------------------------------------------------
 3749 */
 3750 floatx80 floatx80_mul( floatx80 a, floatx80 b )
 3751 {
 3752     flag aSign, bSign, zSign;
 3753     int32 aExp, bExp, zExp;
 3754     bits64 aSig, bSig, zSig0, zSig1;
 3755     floatx80 z;
 3756 
 3757     aSig = extractFloatx80Frac( a );
 3758     aExp = extractFloatx80Exp( a );
 3759     aSign = extractFloatx80Sign( a );
 3760     bSig = extractFloatx80Frac( b );
 3761     bExp = extractFloatx80Exp( b );
 3762     bSign = extractFloatx80Sign( b );
 3763     zSign = aSign ^ bSign;
 3764     if ( aExp == 0x7FFF ) {
 3765         if (    (bits64) ( aSig<<1 )
 3766              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
 3767             return propagateFloatx80NaN( a, b );
 3768         }
 3769         if ( ( bExp | bSig ) == 0 ) goto invalid;
 3770         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 3771     }
 3772     if ( bExp == 0x7FFF ) {
 3773         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
 3774         if ( ( aExp | aSig ) == 0 ) {
 3775  invalid:
 3776             float_raise( float_flag_invalid );
 3777             z.low = floatx80_default_nan_low;
 3778             z.high = floatx80_default_nan_high;
 3779             return z;
 3780         }
 3781         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 3782     }
 3783     if ( aExp == 0 ) {
 3784         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
 3785         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
 3786     }
 3787     if ( bExp == 0 ) {
 3788         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
 3789         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
 3790     }
 3791     zExp = aExp + bExp - 0x3FFE;
 3792     mul64To128( aSig, bSig, &zSig0, &zSig1 );
 3793     if ( 0 < (sbits64) zSig0 ) {
 3794         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
 3795         --zExp;
 3796     }
 3797     return
 3798         roundAndPackFloatx80(
 3799             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
 3800 
 3801 }
 3802 
 3803 /*
 3804 -------------------------------------------------------------------------------
 3805 Returns the result of dividing the extended double-precision floating-point
 3806 value `a' by the corresponding value `b'.  The operation is performed
 3807 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3808 -------------------------------------------------------------------------------
 3809 */
 3810 floatx80 floatx80_div( floatx80 a, floatx80 b )
 3811 {
 3812     flag aSign, bSign, zSign;
 3813     int32 aExp, bExp, zExp;
 3814     bits64 aSig, bSig, zSig0, zSig1;
 3815     bits64 rem0, rem1, rem2, term0, term1, term2;
 3816     floatx80 z;
 3817 
 3818     aSig = extractFloatx80Frac( a );
 3819     aExp = extractFloatx80Exp( a );
 3820     aSign = extractFloatx80Sign( a );
 3821     bSig = extractFloatx80Frac( b );
 3822     bExp = extractFloatx80Exp( b );
 3823     bSign = extractFloatx80Sign( b );
 3824     zSign = aSign ^ bSign;
 3825     if ( aExp == 0x7FFF ) {
 3826         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
 3827         if ( bExp == 0x7FFF ) {
 3828             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
 3829             goto invalid;
 3830         }
 3831         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 3832     }
 3833     if ( bExp == 0x7FFF ) {
 3834         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
 3835         return packFloatx80( zSign, 0, 0 );
 3836     }
 3837     if ( bExp == 0 ) {
 3838         if ( bSig == 0 ) {
 3839             if ( ( aExp | aSig ) == 0 ) {
 3840  invalid:
 3841                 float_raise( float_flag_invalid );
 3842                 z.low = floatx80_default_nan_low;
 3843                 z.high = floatx80_default_nan_high;
 3844                 return z;
 3845             }
 3846             float_raise( float_flag_divbyzero );
 3847             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 3848         }
 3849         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
 3850     }
 3851     if ( aExp == 0 ) {
 3852         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
 3853         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
 3854     }
 3855     zExp = aExp - bExp + 0x3FFE;
 3856     rem1 = 0;
 3857     if ( bSig <= aSig ) {
 3858         shift128Right( aSig, 0, 1, &aSig, &rem1 );
 3859         ++zExp;
 3860     }
 3861     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
 3862     mul64To128( bSig, zSig0, &term0, &term1 );
 3863     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
 3864     while ( (sbits64) rem0 < 0 ) {
 3865         --zSig0;
 3866         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
 3867     }
 3868     zSig1 = estimateDiv128To64( rem1, 0, bSig );
 3869     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
 3870         mul64To128( bSig, zSig1, &term1, &term2 );
 3871         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
 3872         while ( (sbits64) rem1 < 0 ) {
 3873             --zSig1;
 3874             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
 3875         }
 3876         zSig1 |= ( ( rem1 | rem2 ) != 0 );
 3877     }
 3878     return
 3879         roundAndPackFloatx80(
 3880             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
 3881 
 3882 }
 3883 
 3884 /*
 3885 -------------------------------------------------------------------------------
 3886 Returns the remainder of the extended double-precision floating-point value
 3887 `a' with respect to the corresponding value `b'.  The operation is performed
 3888 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 3889 -------------------------------------------------------------------------------
 3890 */
 3891 floatx80 floatx80_rem( floatx80 a, floatx80 b )
 3892 {
 3893     flag aSign, bSign, zSign;
 3894     int32 aExp, bExp, expDiff;
 3895     bits64 aSig0, aSig1, bSig;
 3896     bits64 q, term0, term1, alternateASig0, alternateASig1;
 3897     floatx80 z;
 3898 
 3899     aSig0 = extractFloatx80Frac( a );
 3900     aExp = extractFloatx80Exp( a );
 3901     aSign = extractFloatx80Sign( a );
 3902     bSig = extractFloatx80Frac( b );
 3903     bExp = extractFloatx80Exp( b );
 3904     bSign = extractFloatx80Sign( b );
 3905     if ( aExp == 0x7FFF ) {
 3906         if (    (bits64) ( aSig0<<1 )
 3907              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
 3908             return propagateFloatx80NaN( a, b );
 3909         }
 3910         goto invalid;
 3911     }
 3912     if ( bExp == 0x7FFF ) {
 3913         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
 3914         return a;
 3915     }
 3916     if ( bExp == 0 ) {
 3917         if ( bSig == 0 ) {
 3918  invalid:
 3919             float_raise( float_flag_invalid );
 3920             z.low = floatx80_default_nan_low;
 3921             z.high = floatx80_default_nan_high;
 3922             return z;
 3923         }
 3924         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
 3925     }
 3926     if ( aExp == 0 ) {
 3927         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
 3928         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
 3929     }
 3930     bSig |= LIT64( 0x8000000000000000 );
 3931     zSign = aSign;
 3932     expDiff = aExp - bExp;
 3933     aSig1 = 0;
 3934     if ( expDiff < 0 ) {
 3935         if ( expDiff < -1 ) return a;
 3936         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
 3937         expDiff = 0;
 3938     }
 3939     q = ( bSig <= aSig0 );
 3940     if ( q ) aSig0 -= bSig;
 3941     expDiff -= 64;
 3942     while ( 0 < expDiff ) {
 3943         q = estimateDiv128To64( aSig0, aSig1, bSig );
 3944         q = ( 2 < q ) ? q - 2 : 0;
 3945         mul64To128( bSig, q, &term0, &term1 );
 3946         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
 3947         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
 3948         expDiff -= 62;
 3949     }
 3950     expDiff += 64;
 3951     if ( 0 < expDiff ) {
 3952         q = estimateDiv128To64( aSig0, aSig1, bSig );
 3953         q = ( 2 < q ) ? q - 2 : 0;
 3954         q >>= 64 - expDiff;
 3955         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
 3956         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
 3957         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
 3958         while ( le128( term0, term1, aSig0, aSig1 ) ) {
 3959             ++q;
 3960             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
 3961         }
 3962     }
 3963     else {
 3964         term1 = 0;
 3965         term0 = bSig;
 3966     }
 3967     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
 3968     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
 3969          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
 3970               && ( q & 1 ) )
 3971        ) {
 3972         aSig0 = alternateASig0;
 3973         aSig1 = alternateASig1;
 3974         zSign = ! zSign;
 3975     }
 3976     return
 3977         normalizeRoundAndPackFloatx80(
 3978             80, zSign, bExp + expDiff, aSig0, aSig1 );
 3979 
 3980 }
 3981 
 3982 /*
 3983 -------------------------------------------------------------------------------
 3984 Returns the square root of the extended double-precision floating-point
 3985 value `a'.  The operation is performed according to the IEC/IEEE Standard
 3986 for Binary Floating-Point Arithmetic.
 3987 -------------------------------------------------------------------------------
 3988 */
 3989 floatx80 floatx80_sqrt( floatx80 a )
 3990 {
 3991     flag aSign;
 3992     int32 aExp, zExp;
 3993     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
 3994     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
 3995     floatx80 z;
 3996 
 3997     aSig0 = extractFloatx80Frac( a );
 3998     aExp = extractFloatx80Exp( a );
 3999     aSign = extractFloatx80Sign( a );
 4000     if ( aExp == 0x7FFF ) {
 4001         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
 4002         if ( ! aSign ) return a;
 4003         goto invalid;
 4004     }
 4005     if ( aSign ) {
 4006         if ( ( aExp | aSig0 ) == 0 ) return a;
 4007  invalid:
 4008         float_raise( float_flag_invalid );
 4009         z.low = floatx80_default_nan_low;
 4010         z.high = floatx80_default_nan_high;
 4011         return z;
 4012     }
 4013     if ( aExp == 0 ) {
 4014         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
 4015         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
 4016     }
 4017     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
 4018     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
 4019     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
 4020     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
 4021     doubleZSig0 = zSig0<<1;
 4022     mul64To128( zSig0, zSig0, &term0, &term1 );
 4023     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
 4024     while ( (sbits64) rem0 < 0 ) {
 4025         --zSig0;
 4026         doubleZSig0 -= 2;
 4027         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
 4028     }
 4029     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
 4030     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
 4031         if ( zSig1 == 0 ) zSig1 = 1;
 4032         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
 4033         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
 4034         mul64To128( zSig1, zSig1, &term2, &term3 );
 4035         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
 4036         while ( (sbits64) rem1 < 0 ) {
 4037             --zSig1;
 4038             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
 4039             term3 |= 1;
 4040             term2 |= doubleZSig0;
 4041             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
 4042         }
 4043         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
 4044     }
 4045     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
 4046     zSig0 |= doubleZSig0;
 4047     return
 4048         roundAndPackFloatx80(
 4049             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
 4050 
 4051 }
 4052 
 4053 /*
 4054 -------------------------------------------------------------------------------
 4055 Returns 1 if the extended double-precision floating-point value `a' is
 4056 equal to the corresponding value `b', and 0 otherwise.  The comparison is
 4057 performed according to the IEC/IEEE Standard for Binary Floating-Point
 4058 Arithmetic.
 4059 -------------------------------------------------------------------------------
 4060 */
 4061 flag floatx80_eq( floatx80 a, floatx80 b )
 4062 {
 4063 
 4064     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
 4065               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
 4066          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
 4067               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
 4068        ) {
 4069         if (    floatx80_is_signaling_nan( a )
 4070              || floatx80_is_signaling_nan( b ) ) {
 4071             float_raise( float_flag_invalid );
 4072         }
 4073         return 0;
 4074     }
 4075     return
 4076            ( a.low == b.low )
 4077         && (    ( a.high == b.high )
 4078              || (    ( a.low == 0 )
 4079                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
 4080            );
 4081 
 4082 }
 4083 
 4084 /*
 4085 -------------------------------------------------------------------------------
 4086 Returns 1 if the extended double-precision floating-point value `a' is
 4087 less than or equal to the corresponding value `b', and 0 otherwise.  The
 4088 comparison is performed according to the IEC/IEEE Standard for Binary
 4089 Floating-Point Arithmetic.
 4090 -------------------------------------------------------------------------------
 4091 */
 4092 flag floatx80_le( floatx80 a, floatx80 b )
 4093 {
 4094     flag aSign, bSign;
 4095 
 4096     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
 4097               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
 4098          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
 4099               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
 4100        ) {
 4101         float_raise( float_flag_invalid );
 4102         return 0;
 4103     }
 4104     aSign = extractFloatx80Sign( a );
 4105     bSign = extractFloatx80Sign( b );
 4106     if ( aSign != bSign ) {
 4107         return
 4108                aSign
 4109             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 4110                  == 0 );
 4111     }
 4112     return
 4113           aSign ? le128( b.high, b.low, a.high, a.low )
 4114         : le128( a.high, a.low, b.high, b.low );
 4115 
 4116 }
 4117 
 4118 /*
 4119 -------------------------------------------------------------------------------
 4120 Returns 1 if the extended double-precision floating-point value `a' is
 4121 less than the corresponding value `b', and 0 otherwise.  The comparison
 4122 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4123 Arithmetic.
 4124 -------------------------------------------------------------------------------
 4125 */
 4126 flag floatx80_lt( floatx80 a, floatx80 b )
 4127 {
 4128     flag aSign, bSign;
 4129 
 4130     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
 4131               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
 4132          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
 4133               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
 4134        ) {
 4135         float_raise( float_flag_invalid );
 4136         return 0;
 4137     }
 4138     aSign = extractFloatx80Sign( a );
 4139     bSign = extractFloatx80Sign( b );
 4140     if ( aSign != bSign ) {
 4141         return
 4142                aSign
 4143             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 4144                  != 0 );
 4145     }
 4146     return
 4147           aSign ? lt128( b.high, b.low, a.high, a.low )
 4148         : lt128( a.high, a.low, b.high, b.low );
 4149 
 4150 }
 4151 
 4152 /*
 4153 -------------------------------------------------------------------------------
 4154 Returns 1 if the extended double-precision floating-point value `a' is equal
 4155 to the corresponding value `b', and 0 otherwise.  The invalid exception is
 4156 raised if either operand is a NaN.  Otherwise, the comparison is performed
 4157 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 4158 -------------------------------------------------------------------------------
 4159 */
 4160 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
 4161 {
 4162 
 4163     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
 4164               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
 4165          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
 4166               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
 4167        ) {
 4168         float_raise( float_flag_invalid );
 4169         return 0;
 4170     }
 4171     return
 4172            ( a.low == b.low )
 4173         && (    ( a.high == b.high )
 4174              || (    ( a.low == 0 )
 4175                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
 4176            );
 4177 
 4178 }
 4179 
 4180 /*
 4181 -------------------------------------------------------------------------------
 4182 Returns 1 if the extended double-precision floating-point value `a' is less
 4183 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
 4184 do not cause an exception.  Otherwise, the comparison is performed according
 4185 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 4186 -------------------------------------------------------------------------------
 4187 */
 4188 flag floatx80_le_quiet( floatx80 a, floatx80 b )
 4189 {
 4190     flag aSign, bSign;
 4191 
 4192     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
 4193               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
 4194          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
 4195               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
 4196        ) {
 4197         if (    floatx80_is_signaling_nan( a )
 4198              || floatx80_is_signaling_nan( b ) ) {
 4199             float_raise( float_flag_invalid );
 4200         }
 4201         return 0;
 4202     }
 4203     aSign = extractFloatx80Sign( a );
 4204     bSign = extractFloatx80Sign( b );
 4205     if ( aSign != bSign ) {
 4206         return
 4207                aSign
 4208             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 4209                  == 0 );
 4210     }
 4211     return
 4212           aSign ? le128( b.high, b.low, a.high, a.low )
 4213         : le128( a.high, a.low, b.high, b.low );
 4214 
 4215 }
 4216 
 4217 /*
 4218 -------------------------------------------------------------------------------
 4219 Returns 1 if the extended double-precision floating-point value `a' is less
 4220 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
 4221 an exception.  Otherwise, the comparison is performed according to the
 4222 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 4223 -------------------------------------------------------------------------------
 4224 */
 4225 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
 4226 {
 4227     flag aSign, bSign;
 4228 
 4229     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
 4230               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
 4231          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
 4232               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
 4233        ) {
 4234         if (    floatx80_is_signaling_nan( a )
 4235              || floatx80_is_signaling_nan( b ) ) {
 4236             float_raise( float_flag_invalid );
 4237         }
 4238         return 0;
 4239     }
 4240     aSign = extractFloatx80Sign( a );
 4241     bSign = extractFloatx80Sign( b );
 4242     if ( aSign != bSign ) {
 4243         return
 4244                aSign
 4245             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 4246                  != 0 );
 4247     }
 4248     return
 4249           aSign ? lt128( b.high, b.low, a.high, a.low )
 4250         : lt128( a.high, a.low, b.high, b.low );
 4251 
 4252 }
 4253 
 4254 #endif
 4255 
 4256 #ifdef FLOAT128
 4257 
 4258 /*
 4259 -------------------------------------------------------------------------------
 4260 Returns the result of converting the quadruple-precision floating-point
 4261 value `a' to the 32-bit two's complement integer format.  The conversion
 4262 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4263 Arithmetic---which means in particular that the conversion is rounded
 4264 according to the current rounding mode.  If `a' is a NaN, the largest
 4265 positive integer is returned.  Otherwise, if the conversion overflows, the
 4266 largest integer with the same sign as `a' is returned.
 4267 -------------------------------------------------------------------------------
 4268 */
 4269 int32 float128_to_int32( float128 a )
 4270 {
 4271     flag aSign;
 4272     int32 aExp, shiftCount;
 4273     bits64 aSig0, aSig1;
 4274 
 4275     aSig1 = extractFloat128Frac1( a );
 4276     aSig0 = extractFloat128Frac0( a );
 4277     aExp = extractFloat128Exp( a );
 4278     aSign = extractFloat128Sign( a );
 4279     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
 4280     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
 4281     aSig0 |= ( aSig1 != 0 );
 4282     shiftCount = 0x4028 - aExp;
 4283     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
 4284     return roundAndPackInt32( aSign, aSig0 );
 4285 
 4286 }
 4287 
 4288 /*
 4289 -------------------------------------------------------------------------------
 4290 Returns the result of converting the quadruple-precision floating-point
 4291 value `a' to the 32-bit two's complement integer format.  The conversion
 4292 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4293 Arithmetic, except that the conversion is always rounded toward zero.  If
 4294 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
 4295 conversion overflows, the largest integer with the same sign as `a' is
 4296 returned.
 4297 -------------------------------------------------------------------------------
 4298 */
 4299 int32 float128_to_int32_round_to_zero( float128 a )
 4300 {
 4301     flag aSign;
 4302     int32 aExp, shiftCount;
 4303     bits64 aSig0, aSig1, savedASig;
 4304     int32 z;
 4305 
 4306     aSig1 = extractFloat128Frac1( a );
 4307     aSig0 = extractFloat128Frac0( a );
 4308     aExp = extractFloat128Exp( a );
 4309     aSign = extractFloat128Sign( a );
 4310     aSig0 |= ( aSig1 != 0 );
 4311     if ( 0x401E < aExp ) {
 4312         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
 4313         goto invalid;
 4314     }
 4315     else if ( aExp < 0x3FFF ) {
 4316         if ( aExp || aSig0 ) float_set_inexact();
 4317         return 0;
 4318     }
 4319     aSig0 |= LIT64( 0x0001000000000000 );
 4320     shiftCount = 0x402F - aExp;
 4321     savedASig = aSig0;
 4322     aSig0 >>= shiftCount;
 4323     z = aSig0;
 4324     if ( aSign ) z = - z;
 4325     if ( ( z < 0 ) ^ aSign ) {
 4326  invalid:
 4327         float_raise( float_flag_invalid );
 4328         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 4329     }
 4330     if ( ( aSig0<<shiftCount ) != savedASig ) {
 4331         float_set_inexact();
 4332     }
 4333     return z;
 4334 
 4335 }
 4336 
 4337 /*
 4338 -------------------------------------------------------------------------------
 4339 Returns the result of converting the quadruple-precision floating-point
 4340 value `a' to the 64-bit two's complement integer format.  The conversion
 4341 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4342 Arithmetic---which means in particular that the conversion is rounded
 4343 according to the current rounding mode.  If `a' is a NaN, the largest
 4344 positive integer is returned.  Otherwise, if the conversion overflows, the
 4345 largest integer with the same sign as `a' is returned.
 4346 -------------------------------------------------------------------------------
 4347 */
 4348 int64 float128_to_int64( float128 a )
 4349 {
 4350     flag aSign;
 4351     int32 aExp, shiftCount;
 4352     bits64 aSig0, aSig1;
 4353 
 4354     aSig1 = extractFloat128Frac1( a );
 4355     aSig0 = extractFloat128Frac0( a );
 4356     aExp = extractFloat128Exp( a );
 4357     aSign = extractFloat128Sign( a );
 4358     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
 4359     shiftCount = 0x402F - aExp;
 4360     if ( shiftCount <= 0 ) {
 4361         if ( 0x403E < aExp ) {
 4362             float_raise( float_flag_invalid );
 4363             if (    ! aSign
 4364                  || (    ( aExp == 0x7FFF )
 4365                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
 4366                     )
 4367                ) {
 4368                 return LIT64( 0x7FFFFFFFFFFFFFFF );
 4369             }
 4370             return (sbits64) LIT64( 0x8000000000000000 );
 4371         }
 4372         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
 4373     }
 4374     else {
 4375         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
 4376     }
 4377     return roundAndPackInt64( aSign, aSig0, aSig1 );
 4378 
 4379 }
 4380 
 4381 /*
 4382 -------------------------------------------------------------------------------
 4383 Returns the result of converting the quadruple-precision floating-point
 4384 value `a' to the 64-bit two's complement integer format.  The conversion
 4385 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4386 Arithmetic, except that the conversion is always rounded toward zero.
 4387 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
 4388 the conversion overflows, the largest integer with the same sign as `a' is
 4389 returned.
 4390 -------------------------------------------------------------------------------
 4391 */
 4392 int64 float128_to_int64_round_to_zero( float128 a )
 4393 {
 4394     flag aSign;
 4395     int32 aExp, shiftCount;
 4396     bits64 aSig0, aSig1;
 4397     int64 z;
 4398 
 4399     aSig1 = extractFloat128Frac1( a );
 4400     aSig0 = extractFloat128Frac0( a );
 4401     aExp = extractFloat128Exp( a );
 4402     aSign = extractFloat128Sign( a );
 4403     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
 4404     shiftCount = aExp - 0x402F;
 4405     if ( 0 < shiftCount ) {
 4406         if ( 0x403E <= aExp ) {
 4407             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
 4408             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
 4409                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
 4410                 if ( aSig1 ) float_set_inexact();
 4411             }
 4412             else {
 4413                 float_raise( float_flag_invalid );
 4414                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
 4415                     return LIT64( 0x7FFFFFFFFFFFFFFF );
 4416                 }
 4417             }
 4418             return (sbits64) LIT64( 0x8000000000000000 );
 4419         }
 4420         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
 4421         if ( (bits64) ( aSig1<<shiftCount ) ) {
 4422             float_set_inexact();
 4423         }
 4424     }
 4425     else {
 4426         if ( aExp < 0x3FFF ) {
 4427             if ( aExp | aSig0 | aSig1 ) {
 4428                 float_set_inexact();
 4429             }
 4430             return 0;
 4431         }
 4432         z = aSig0>>( - shiftCount );
 4433         if (    aSig1
 4434              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
 4435             float_set_inexact();
 4436         }
 4437     }
 4438     if ( aSign ) z = - z;
 4439     return z;
 4440 
 4441 }
 4442 
 4443 /*
 4444 -------------------------------------------------------------------------------
 4445 Returns the result of converting the quadruple-precision floating-point
 4446 value `a' to the single-precision floating-point format.  The conversion
 4447 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4448 Arithmetic.
 4449 -------------------------------------------------------------------------------
 4450 */
 4451 float32 float128_to_float32( float128 a )
 4452 {
 4453     flag aSign;
 4454     int32 aExp;
 4455     bits64 aSig0, aSig1;
 4456     bits32 zSig;
 4457 
 4458     aSig1 = extractFloat128Frac1( a );
 4459     aSig0 = extractFloat128Frac0( a );
 4460     aExp = extractFloat128Exp( a );
 4461     aSign = extractFloat128Sign( a );
 4462     if ( aExp == 0x7FFF ) {
 4463         if ( aSig0 | aSig1 ) {
 4464             return commonNaNToFloat32( float128ToCommonNaN( a ) );
 4465         }
 4466         return packFloat32( aSign, 0xFF, 0 );
 4467     }
 4468     aSig0 |= ( aSig1 != 0 );
 4469     shift64RightJamming( aSig0, 18, &aSig0 );
 4470     zSig = aSig0;
 4471     if ( aExp || zSig ) {
 4472         zSig |= 0x40000000;
 4473         aExp -= 0x3F81;
 4474     }
 4475     return roundAndPackFloat32( aSign, aExp, zSig );
 4476 
 4477 }
 4478 
 4479 /*
 4480 -------------------------------------------------------------------------------
 4481 Returns the result of converting the quadruple-precision floating-point
 4482 value `a' to the double-precision floating-point format.  The conversion
 4483 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 4484 Arithmetic.
 4485 -------------------------------------------------------------------------------
 4486 */
 4487 float64 float128_to_float64( float128 a )
 4488 {
 4489     flag aSign;
 4490     int32 aExp;
 4491     bits64 aSig0, aSig1;
 4492 
 4493     aSig1 = extractFloat128Frac1( a );
 4494     aSig0 = extractFloat128Frac0( a );
 4495     aExp = extractFloat128Exp( a );
 4496     aSign = extractFloat128Sign( a );
 4497     if ( aExp == 0x7FFF ) {
 4498         if ( aSig0 | aSig1 ) {
 4499             return commonNaNToFloat64( float128ToCommonNaN( a ) );
 4500         }
 4501         return packFloat64( aSign, 0x7FF, 0 );
 4502     }
 4503     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
 4504     aSig0 |= ( aSig1 != 0 );
 4505     if ( aExp || aSig0 ) {
 4506         aSig0 |= LIT64( 0x4000000000000000 );
 4507         aExp -= 0x3C01;
 4508     }
 4509     return roundAndPackFloat64( aSign, aExp, aSig0 );
 4510 
 4511 }
 4512 
 4513 #ifdef FLOATX80
 4514 
 4515 /*
 4516 -------------------------------------------------------------------------------
 4517 Returns the result of converting the quadruple-precision floating-point
 4518 value `a' to the extended double-precision floating-point format.  The
 4519 conversion is performed according to the IEC/IEEE Standard for Binary
 4520 Floating-Point Arithmetic.
 4521 -------------------------------------------------------------------------------
 4522 */
 4523 floatx80 float128_to_floatx80( float128 a )
 4524 {
 4525     flag aSign;
 4526     int32 aExp;
 4527     bits64 aSig0, aSig1;
 4528 
 4529     aSig1 = extractFloat128Frac1( a );
 4530     aSig0 = extractFloat128Frac0( a );
 4531     aExp = extractFloat128Exp( a );
 4532     aSign = extractFloat128Sign( a );
 4533     if ( aExp == 0x7FFF ) {
 4534         if ( aSig0 | aSig1 ) {
 4535             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
 4536         }
 4537         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 4538     }
 4539     if ( aExp == 0 ) {
 4540         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
 4541         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
 4542     }
 4543     else {
 4544         aSig0 |= LIT64( 0x0001000000000000 );
 4545     }
 4546     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
 4547     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
 4548 
 4549 }
 4550 
 4551 #endif
 4552 
 4553 /*
 4554 -------------------------------------------------------------------------------
 4555 Rounds the quadruple-precision floating-point value `a' to an integer, and
 4556 returns the result as a quadruple-precision floating-point value.  The
 4557 operation is performed according to the IEC/IEEE Standard for Binary
 4558 Floating-Point Arithmetic.
 4559 -------------------------------------------------------------------------------
 4560 */
 4561 float128 float128_round_to_int( float128 a )
 4562 {
 4563     flag aSign;
 4564     int32 aExp;
 4565     bits64 lastBitMask, roundBitsMask;
 4566     int8 roundingMode;
 4567     float128 z;
 4568 
 4569     aExp = extractFloat128Exp( a );
 4570     if ( 0x402F <= aExp ) {
 4571         if ( 0x406F <= aExp ) {
 4572             if (    ( aExp == 0x7FFF )
 4573                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
 4574                ) {
 4575                 return propagateFloat128NaN( a, a );
 4576             }
 4577             return a;
 4578         }
 4579         lastBitMask = 1;
 4580         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
 4581         roundBitsMask = lastBitMask - 1;
 4582         z = a;
 4583         roundingMode = float_rounding_mode();
 4584         if ( roundingMode == float_round_nearest_even ) {
 4585             if ( lastBitMask ) {
 4586                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
 4587                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
 4588             }
 4589             else {
 4590                 if ( (sbits64) z.low < 0 ) {
 4591                     ++z.high;
 4592                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
 4593                 }
 4594             }
 4595         }
 4596         else if ( roundingMode != float_round_to_zero ) {
 4597             if (   extractFloat128Sign( z )
 4598                  ^ ( roundingMode == float_round_up ) ) {
 4599                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
 4600             }
 4601         }
 4602         z.low &= ~ roundBitsMask;
 4603     }
 4604     else {
 4605         if ( aExp < 0x3FFF ) {
 4606             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
 4607             float_set_inexact();
 4608             aSign = extractFloat128Sign( a );
 4609             switch ( float_rounding_mode() ) {
 4610              case float_round_nearest_even:
 4611                 if (    ( aExp == 0x3FFE )
 4612                      && (   extractFloat128Frac0( a )
 4613                           | extractFloat128Frac1( a ) )
 4614                    ) {
 4615                     return packFloat128( aSign, 0x3FFF, 0, 0 );
 4616                 }
 4617                 break;
 4618              case float_round_down:
 4619                 return
 4620                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
 4621                     : packFloat128( 0, 0, 0, 0 );
 4622              case float_round_up:
 4623                 return
 4624                       aSign ? packFloat128( 1, 0, 0, 0 )
 4625                     : packFloat128( 0, 0x3FFF, 0, 0 );
 4626             }
 4627             return packFloat128( aSign, 0, 0, 0 );
 4628         }
 4629         lastBitMask = 1;
 4630         lastBitMask <<= 0x402F - aExp;
 4631         roundBitsMask = lastBitMask - 1;
 4632         z.low = 0;
 4633         z.high = a.high;
 4634         roundingMode = float_rounding_mode();
 4635         if ( roundingMode == float_round_nearest_even ) {
 4636             z.high += lastBitMask>>1;
 4637             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
 4638                 z.high &= ~ lastBitMask;
 4639             }
 4640         }
 4641         else if ( roundingMode != float_round_to_zero ) {
 4642             if (   extractFloat128Sign( z )
 4643                  ^ ( roundingMode == float_round_up ) ) {
 4644                 z.high |= ( a.low != 0 );
 4645                 z.high += roundBitsMask;
 4646             }
 4647         }
 4648         z.high &= ~ roundBitsMask;
 4649     }
 4650     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
 4651         float_set_inexact();
 4652     }
 4653     return z;
 4654 
 4655 }
 4656 
 4657 /*
 4658 -------------------------------------------------------------------------------
 4659 Returns the result of adding the absolute values of the quadruple-precision
 4660 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
 4661 before being returned.  `zSign' is ignored if the result is a NaN.
 4662 The addition is performed according to the IEC/IEEE Standard for Binary
 4663 Floating-Point Arithmetic.
 4664 -------------------------------------------------------------------------------
 4665 */
 4666 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
 4667 {
 4668     int32 aExp, bExp, zExp;
 4669     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
 4670     int32 expDiff;
 4671 
 4672     aSig1 = extractFloat128Frac1( a );
 4673     aSig0 = extractFloat128Frac0( a );
 4674     aExp = extractFloat128Exp( a );
 4675     bSig1 = extractFloat128Frac1( b );
 4676     bSig0 = extractFloat128Frac0( b );
 4677     bExp = extractFloat128Exp( b );
 4678     expDiff = aExp - bExp;
 4679     if ( 0 < expDiff ) {
 4680         if ( aExp == 0x7FFF ) {
 4681             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
 4682             return a;
 4683         }
 4684         if ( bExp == 0 ) {
 4685             --expDiff;
 4686         }
 4687         else {
 4688             bSig0 |= LIT64( 0x0001000000000000 );
 4689         }
 4690         shift128ExtraRightJamming(
 4691             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
 4692         zExp = aExp;
 4693     }
 4694     else if ( expDiff < 0 ) {
 4695         if ( bExp == 0x7FFF ) {
 4696             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
 4697             return packFloat128( zSign, 0x7FFF, 0, 0 );
 4698         }
 4699         if ( aExp == 0 ) {
 4700             ++expDiff;
 4701         }
 4702         else {
 4703             aSig0 |= LIT64( 0x0001000000000000 );
 4704         }
 4705         shift128ExtraRightJamming(
 4706             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
 4707         zExp = bExp;
 4708     }
 4709     else {
 4710         if ( aExp == 0x7FFF ) {
 4711             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
 4712                 return propagateFloat128NaN( a, b );
 4713             }
 4714             return a;
 4715         }
 4716         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
 4717         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
 4718         zSig2 = 0;
 4719         zSig0 |= LIT64( 0x0002000000000000 );
 4720         zExp = aExp;
 4721         goto shiftRight1;
 4722     }
 4723     aSig0 |= LIT64( 0x0001000000000000 );
 4724     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
 4725     --zExp;
 4726     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
 4727     ++zExp;
 4728  shiftRight1:
 4729     shift128ExtraRightJamming(
 4730         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 4731  roundAndPack:
 4732     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
 4733 
 4734 }
 4735 
 4736 /*
 4737 -------------------------------------------------------------------------------
 4738 Returns the result of subtracting the absolute values of the quadruple-
 4739 precision floating-point values `a' and `b'.  If `zSign' is 1, the
 4740 difference is negated before being returned.  `zSign' is ignored if the
 4741 result is a NaN.  The subtraction is performed according to the IEC/IEEE
 4742 Standard for Binary Floating-Point Arithmetic.
 4743 -------------------------------------------------------------------------------
 4744 */
 4745 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
 4746 {
 4747     int32 aExp, bExp, zExp;
 4748     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
 4749     int32 expDiff;
 4750     float128 z;
 4751 
 4752     aSig1 = extractFloat128Frac1( a );
 4753     aSig0 = extractFloat128Frac0( a );
 4754     aExp = extractFloat128Exp( a );
 4755     bSig1 = extractFloat128Frac1( b );
 4756     bSig0 = extractFloat128Frac0( b );
 4757     bExp = extractFloat128Exp( b );
 4758     expDiff = aExp - bExp;
 4759     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
 4760     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
 4761     if ( 0 < expDiff ) goto aExpBigger;
 4762     if ( expDiff < 0 ) goto bExpBigger;
 4763     if ( aExp == 0x7FFF ) {
 4764         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
 4765             return propagateFloat128NaN( a, b );
 4766         }
 4767         float_raise( float_flag_invalid );
 4768         z.low = float128_default_nan_low;
 4769         z.high = float128_default_nan_high;
 4770         return z;
 4771     }
 4772     if ( aExp == 0 ) {
 4773         aExp = 1;
 4774         bExp = 1;
 4775     }
 4776     if ( bSig0 < aSig0 ) goto aBigger;
 4777     if ( aSig0 < bSig0 ) goto bBigger;
 4778     if ( bSig1 < aSig1 ) goto aBigger;
 4779     if ( aSig1 < bSig1 ) goto bBigger;
 4780     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
 4781  bExpBigger:
 4782     if ( bExp == 0x7FFF ) {
 4783         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
 4784         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
 4785     }
 4786     if ( aExp == 0 ) {
 4787         ++expDiff;
 4788     }
 4789     else {
 4790         aSig0 |= LIT64( 0x4000000000000000 );
 4791     }
 4792     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
 4793     bSig0 |= LIT64( 0x4000000000000000 );
 4794  bBigger:
 4795     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
 4796     zExp = bExp;
 4797     zSign ^= 1;
 4798     goto normalizeRoundAndPack;
 4799  aExpBigger:
 4800     if ( aExp == 0x7FFF ) {
 4801         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
 4802         return a;
 4803     }
 4804     if ( bExp == 0 ) {
 4805         --expDiff;
 4806     }
 4807     else {
 4808         bSig0 |= LIT64( 0x4000000000000000 );
 4809     }
 4810     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
 4811     aSig0 |= LIT64( 0x4000000000000000 );
 4812  aBigger:
 4813     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
 4814     zExp = aExp;
 4815  normalizeRoundAndPack:
 4816     --zExp;
 4817     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
 4818 
 4819 }
 4820 
 4821 /*
 4822 -------------------------------------------------------------------------------
 4823 Returns the result of adding the quadruple-precision floating-point values
 4824 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
 4825 for Binary Floating-Point Arithmetic.
 4826 -------------------------------------------------------------------------------
 4827 */
 4828 float128 float128_add( float128 a, float128 b )
 4829 {
 4830     flag aSign, bSign;
 4831 
 4832     aSign = extractFloat128Sign( a );
 4833     bSign = extractFloat128Sign( b );
 4834     if ( aSign == bSign ) {
 4835         return addFloat128Sigs( a, b, aSign );
 4836     }
 4837     else {
 4838         return subFloat128Sigs( a, b, aSign );
 4839     }
 4840 
 4841 }
 4842 
 4843 /*
 4844 -------------------------------------------------------------------------------
 4845 Returns the result of subtracting the quadruple-precision floating-point
 4846 values `a' and `b'.  The operation is performed according to the IEC/IEEE
 4847 Standard for Binary Floating-Point Arithmetic.
 4848 -------------------------------------------------------------------------------
 4849 */
 4850 float128 float128_sub( float128 a, float128 b )
 4851 {
 4852     flag aSign, bSign;
 4853 
 4854     aSign = extractFloat128Sign( a );
 4855     bSign = extractFloat128Sign( b );
 4856     if ( aSign == bSign ) {
 4857         return subFloat128Sigs( a, b, aSign );
 4858     }
 4859     else {
 4860         return addFloat128Sigs( a, b, aSign );
 4861     }
 4862 
 4863 }
 4864 
 4865 /*
 4866 -------------------------------------------------------------------------------
 4867 Returns the result of multiplying the quadruple-precision floating-point
 4868 values `a' and `b'.  The operation is performed according to the IEC/IEEE
 4869 Standard for Binary Floating-Point Arithmetic.
 4870 -------------------------------------------------------------------------------
 4871 */
 4872 float128 float128_mul( float128 a, float128 b )
 4873 {
 4874     flag aSign, bSign, zSign;
 4875     int32 aExp, bExp, zExp;
 4876     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
 4877     float128 z;
 4878 
 4879     aSig1 = extractFloat128Frac1( a );
 4880     aSig0 = extractFloat128Frac0( a );
 4881     aExp = extractFloat128Exp( a );
 4882     aSign = extractFloat128Sign( a );
 4883     bSig1 = extractFloat128Frac1( b );
 4884     bSig0 = extractFloat128Frac0( b );
 4885     bExp = extractFloat128Exp( b );
 4886     bSign = extractFloat128Sign( b );
 4887     zSign = aSign ^ bSign;
 4888     if ( aExp == 0x7FFF ) {
 4889         if (    ( aSig0 | aSig1 )
 4890              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
 4891             return propagateFloat128NaN( a, b );
 4892         }
 4893         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
 4894         return packFloat128( zSign, 0x7FFF, 0, 0 );
 4895     }
 4896     if ( bExp == 0x7FFF ) {
 4897         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
 4898         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 4899  invalid:
 4900             float_raise( float_flag_invalid );
 4901             z.low = float128_default_nan_low;
 4902             z.high = float128_default_nan_high;
 4903             return z;
 4904         }
 4905         return packFloat128( zSign, 0x7FFF, 0, 0 );
 4906     }
 4907     if ( aExp == 0 ) {
 4908         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
 4909         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
 4910     }
 4911     if ( bExp == 0 ) {
 4912         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
 4913         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
 4914     }
 4915     zExp = aExp + bExp - 0x4000;
 4916     aSig0 |= LIT64( 0x0001000000000000 );
 4917     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
 4918     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
 4919     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
 4920     zSig2 |= ( zSig3 != 0 );
 4921     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
 4922         shift128ExtraRightJamming(
 4923             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 4924         ++zExp;
 4925     }
 4926     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
 4927 
 4928 }
 4929 
 4930 /*
 4931 -------------------------------------------------------------------------------
 4932 Returns the result of dividing the quadruple-precision floating-point value
 4933 `a' by the corresponding value `b'.  The operation is performed according to
 4934 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 4935 -------------------------------------------------------------------------------
 4936 */
 4937 float128 float128_div( float128 a, float128 b )
 4938 {
 4939     flag aSign, bSign, zSign;
 4940     int32 aExp, bExp, zExp;
 4941     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
 4942     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
 4943     float128 z;
 4944 
 4945     aSig1 = extractFloat128Frac1( a );
 4946     aSig0 = extractFloat128Frac0( a );
 4947     aExp = extractFloat128Exp( a );
 4948     aSign = extractFloat128Sign( a );
 4949     bSig1 = extractFloat128Frac1( b );
 4950     bSig0 = extractFloat128Frac0( b );
 4951     bExp = extractFloat128Exp( b );
 4952     bSign = extractFloat128Sign( b );
 4953     zSign = aSign ^ bSign;
 4954     if ( aExp == 0x7FFF ) {
 4955         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
 4956         if ( bExp == 0x7FFF ) {
 4957             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
 4958             goto invalid;
 4959         }
 4960         return packFloat128( zSign, 0x7FFF, 0, 0 );
 4961     }
 4962     if ( bExp == 0x7FFF ) {
 4963         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
 4964         return packFloat128( zSign, 0, 0, 0 );
 4965     }
 4966     if ( bExp == 0 ) {
 4967         if ( ( bSig0 | bSig1 ) == 0 ) {
 4968             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 4969  invalid:
 4970                 float_raise( float_flag_invalid );
 4971                 z.low = float128_default_nan_low;
 4972                 z.high = float128_default_nan_high;
 4973                 return z;
 4974             }
 4975             float_raise( float_flag_divbyzero );
 4976             return packFloat128( zSign, 0x7FFF, 0, 0 );
 4977         }
 4978         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
 4979     }
 4980     if ( aExp == 0 ) {
 4981         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
 4982         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
 4983     }
 4984     zExp = aExp - bExp + 0x3FFD;
 4985     shortShift128Left(
 4986         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
 4987     shortShift128Left(
 4988         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
 4989     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
 4990         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
 4991         ++zExp;
 4992     }
 4993     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
 4994     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
 4995     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
 4996     while ( (sbits64) rem0 < 0 ) {
 4997         --zSig0;
 4998         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
 4999     }
 5000     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
 5001     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
 5002         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
 5003         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
 5004         while ( (sbits64) rem1 < 0 ) {
 5005             --zSig1;
 5006             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
 5007         }
 5008         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
 5009     }
 5010     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
 5011     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
 5012 
 5013 }
 5014 
 5015 /*
 5016 -------------------------------------------------------------------------------
 5017 Returns the remainder of the quadruple-precision floating-point value `a'
 5018 with respect to the corresponding value `b'.  The operation is performed
 5019 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 5020 -------------------------------------------------------------------------------
 5021 */
 5022 float128 float128_rem( float128 a, float128 b )
 5023 {
 5024     flag aSign, bSign, zSign;
 5025     int32 aExp, bExp, expDiff;
 5026     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
 5027     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
 5028     sbits64 sigMean0;
 5029     float128 z;
 5030 
 5031     aSig1 = extractFloat128Frac1( a );
 5032     aSig0 = extractFloat128Frac0( a );
 5033     aExp = extractFloat128Exp( a );
 5034     aSign = extractFloat128Sign( a );
 5035     bSig1 = extractFloat128Frac1( b );
 5036     bSig0 = extractFloat128Frac0( b );
 5037     bExp = extractFloat128Exp( b );
 5038     bSign = extractFloat128Sign( b );
 5039     if ( aExp == 0x7FFF ) {
 5040         if (    ( aSig0 | aSig1 )
 5041              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
 5042             return propagateFloat128NaN( a, b );
 5043         }
 5044         goto invalid;
 5045     }
 5046     if ( bExp == 0x7FFF ) {
 5047         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
 5048         return a;
 5049     }
 5050     if ( bExp == 0 ) {
 5051         if ( ( bSig0 | bSig1 ) == 0 ) {
 5052  invalid:
 5053             float_raise( float_flag_invalid );
 5054             z.low = float128_default_nan_low;
 5055             z.high = float128_default_nan_high;
 5056             return z;
 5057         }
 5058         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
 5059     }
 5060     if ( aExp == 0 ) {
 5061         if ( ( aSig0 | aSig1 ) == 0 ) return a;
 5062         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
 5063     }
 5064     expDiff = aExp - bExp;
 5065     if ( expDiff < -1 ) return a;
 5066     shortShift128Left(
 5067         aSig0 | LIT64( 0x0001000000000000 ),
 5068         aSig1,
 5069         15 - ( expDiff < 0 ),
 5070         &aSig0,
 5071         &aSig1
 5072     );
 5073     shortShift128Left(
 5074         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
 5075     q = le128( bSig0, bSig1, aSig0, aSig1 );
 5076     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
 5077     expDiff -= 64;
 5078     while ( 0 < expDiff ) {
 5079         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
 5080         q = ( 4 < q ) ? q - 4 : 0;
 5081         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
 5082         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
 5083         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
 5084         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
 5085         expDiff -= 61;
 5086     }
 5087     if ( -64 < expDiff ) {
 5088         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
 5089         q = ( 4 < q ) ? q - 4 : 0;
 5090         q >>= - expDiff;
 5091         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
 5092         expDiff += 52;
 5093         if ( expDiff < 0 ) {
 5094             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
 5095         }
 5096         else {
 5097             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
 5098         }
 5099         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
 5100         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
 5101     }
 5102     else {
 5103         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
 5104         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
 5105     }
 5106     do {
 5107         alternateASig0 = aSig0;
 5108         alternateASig1 = aSig1;
 5109         ++q;
 5110         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
 5111     } while ( 0 <= (sbits64) aSig0 );
 5112     add128(
 5113         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
 5114     if (    ( sigMean0 < 0 )
 5115          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
 5116         aSig0 = alternateASig0;
 5117         aSig1 = alternateASig1;
 5118     }
 5119     zSign = ( (sbits64) aSig0 < 0 );
 5120     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
 5121     return
 5122         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
 5123 
 5124 }
 5125 
 5126 /*
 5127 -------------------------------------------------------------------------------
 5128 Returns the square root of the quadruple-precision floating-point value `a'.
 5129 The operation is performed according to the IEC/IEEE Standard for Binary
 5130 Floating-Point Arithmetic.
 5131 -------------------------------------------------------------------------------
 5132 */
 5133 float128 float128_sqrt( float128 a )
 5134 {
 5135     flag aSign;
 5136     int32 aExp, zExp;
 5137     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
 5138     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
 5139     float128 z;
 5140 
 5141     aSig1 = extractFloat128Frac1( a );
 5142     aSig0 = extractFloat128Frac0( a );
 5143     aExp = extractFloat128Exp( a );
 5144     aSign = extractFloat128Sign( a );
 5145     if ( aExp == 0x7FFF ) {
 5146         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
 5147         if ( ! aSign ) return a;
 5148         goto invalid;
 5149     }
 5150     if ( aSign ) {
 5151         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 5152  invalid:
 5153         float_raise( float_flag_invalid );
 5154         z.low = float128_default_nan_low;
 5155         z.high = float128_default_nan_high;
 5156         return z;
 5157     }
 5158     if ( aExp == 0 ) {
 5159         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
 5160         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
 5161     }
 5162     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
 5163     aSig0 |= LIT64( 0x0001000000000000 );
 5164     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
 5165     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
 5166     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
 5167     doubleZSig0 = zSig0<<1;
 5168     mul64To128( zSig0, zSig0, &term0, &term1 );
 5169     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
 5170     while ( (sbits64) rem0 < 0 ) {
 5171         --zSig0;
 5172         doubleZSig0 -= 2;
 5173         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
 5174     }
 5175     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
 5176     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
 5177         if ( zSig1 == 0 ) zSig1 = 1;
 5178         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
 5179         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
 5180         mul64To128( zSig1, zSig1, &term2, &term3 );
 5181         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
 5182         while ( (sbits64) rem1 < 0 ) {
 5183             --zSig1;
 5184             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
 5185             term3 |= 1;
 5186             term2 |= doubleZSig0;
 5187             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
 5188         }
 5189         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
 5190     }
 5191     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
 5192     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
 5193 
 5194 }
 5195 
 5196 /*
 5197 -------------------------------------------------------------------------------
 5198 Returns 1 if the quadruple-precision floating-point value `a' is equal to
 5199 the corresponding value `b', and 0 otherwise.  The comparison is performed
 5200 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 5201 -------------------------------------------------------------------------------
 5202 */
 5203 flag float128_eq( float128 a, float128 b )
 5204 {
 5205 
 5206     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
 5207               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
 5208          || (    ( extractFloat128Exp( b ) == 0x7FFF )
 5209               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
 5210        ) {
 5211         if (    float128_is_signaling_nan( a )
 5212              || float128_is_signaling_nan( b ) ) {
 5213             float_raise( float_flag_invalid );
 5214         }
 5215         return 0;
 5216     }
 5217     return
 5218            ( a.low == b.low )
 5219         && (    ( a.high == b.high )
 5220              || (    ( a.low == 0 )
 5221                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
 5222            );
 5223 
 5224 }
 5225 
 5226 /*
 5227 -------------------------------------------------------------------------------
 5228 Returns 1 if the quadruple-precision floating-point value `a' is less than
 5229 or equal to the corresponding value `b', and 0 otherwise.  The comparison
 5230 is performed according to the IEC/IEEE Standard for Binary Floating-Point
 5231 Arithmetic.
 5232 -------------------------------------------------------------------------------
 5233 */
 5234 flag float128_le( float128 a, float128 b )
 5235 {
 5236     flag aSign, bSign;
 5237 
 5238     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
 5239               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
 5240          || (    ( extractFloat128Exp( b ) == 0x7FFF )
 5241               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
 5242        ) {
 5243         float_raise( float_flag_invalid );
 5244         return 0;
 5245     }
 5246     aSign = extractFloat128Sign( a );
 5247     bSign = extractFloat128Sign( b );
 5248     if ( aSign != bSign ) {
 5249         return
 5250                aSign
 5251             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 5252                  == 0 );
 5253     }
 5254     return
 5255           aSign ? le128( b.high, b.low, a.high, a.low )
 5256         : le128( a.high, a.low, b.high, b.low );
 5257 
 5258 }
 5259 
 5260 /*
 5261 -------------------------------------------------------------------------------
 5262 Returns 1 if the quadruple-precision floating-point value `a' is less than
 5263 the corresponding value `b', and 0 otherwise.  The comparison is performed
 5264 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 5265 -------------------------------------------------------------------------------
 5266 */
 5267 flag float128_lt( float128 a, float128 b )
 5268 {
 5269     flag aSign, bSign;
 5270 
 5271     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
 5272               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
 5273          || (    ( extractFloat128Exp( b ) == 0x7FFF )
 5274               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
 5275        ) {
 5276         float_raise( float_flag_invalid );
 5277         return 0;
 5278     }
 5279     aSign = extractFloat128Sign( a );
 5280     bSign = extractFloat128Sign( b );
 5281     if ( aSign != bSign ) {
 5282         return
 5283                aSign
 5284             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 5285                  != 0 );
 5286     }
 5287     return
 5288           aSign ? lt128( b.high, b.low, a.high, a.low )
 5289         : lt128( a.high, a.low, b.high, b.low );
 5290 
 5291 }
 5292 
 5293 /*
 5294 -------------------------------------------------------------------------------
 5295 Returns 1 if the quadruple-precision floating-point value `a' is equal to
 5296 the corresponding value `b', and 0 otherwise.  The invalid exception is
 5297 raised if either operand is a NaN.  Otherwise, the comparison is performed
 5298 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 5299 -------------------------------------------------------------------------------
 5300 */
 5301 flag float128_eq_signaling( float128 a, float128 b )
 5302 {
 5303 
 5304     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
 5305               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
 5306          || (    ( extractFloat128Exp( b ) == 0x7FFF )
 5307               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
 5308        ) {
 5309         float_raise( float_flag_invalid );
 5310         return 0;
 5311     }
 5312     return
 5313            ( a.low == b.low )
 5314         && (    ( a.high == b.high )
 5315              || (    ( a.low == 0 )
 5316                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
 5317            );
 5318 
 5319 }
 5320 
 5321 /*
 5322 -------------------------------------------------------------------------------
 5323 Returns 1 if the quadruple-precision floating-point value `a' is less than
 5324 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
 5325 cause an exception.  Otherwise, the comparison is performed according to the
 5326 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 5327 -------------------------------------------------------------------------------
 5328 */
 5329 flag float128_le_quiet( float128 a, float128 b )
 5330 {
 5331     flag aSign, bSign;
 5332 
 5333     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
 5334               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
 5335          || (    ( extractFloat128Exp( b ) == 0x7FFF )
 5336               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
 5337        ) {
 5338         if (    float128_is_signaling_nan( a )
 5339              || float128_is_signaling_nan( b ) ) {
 5340             float_raise( float_flag_invalid );
 5341         }
 5342         return 0;
 5343     }
 5344     aSign = extractFloat128Sign( a );
 5345     bSign = extractFloat128Sign( b );
 5346     if ( aSign != bSign ) {
 5347         return
 5348                aSign
 5349             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 5350                  == 0 );
 5351     }
 5352     return
 5353           aSign ? le128( b.high, b.low, a.high, a.low )
 5354         : le128( a.high, a.low, b.high, b.low );
 5355 
 5356 }
 5357 
 5358 /*
 5359 -------------------------------------------------------------------------------
 5360 Returns 1 if the quadruple-precision floating-point value `a' is less than
 5361 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
 5362 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
 5363 Standard for Binary Floating-Point Arithmetic.
 5364 -------------------------------------------------------------------------------
 5365 */
 5366 flag float128_lt_quiet( float128 a, float128 b )
 5367 {
 5368     flag aSign, bSign;
 5369 
 5370     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
 5371               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
 5372          || (    ( extractFloat128Exp( b ) == 0x7FFF )
 5373               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
 5374        ) {
 5375         if (    float128_is_signaling_nan( a )
 5376              || float128_is_signaling_nan( b ) ) {
 5377             float_raise( float_flag_invalid );
 5378         }
 5379         return 0;
 5380     }
 5381     aSign = extractFloat128Sign( a );
 5382     bSign = extractFloat128Sign( b );
 5383     if ( aSign != bSign ) {
 5384         return
 5385                aSign
 5386             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
 5387                  != 0 );
 5388     }
 5389     return
 5390           aSign ? lt128( b.high, b.low, a.high, a.low )
 5391         : lt128( a.high, a.low, b.high, b.low );
 5392 
 5393 }
 5394 
 5395 #endif
 5396 
 5397 
 5398 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
 5399 
 5400 /*
 5401  * These two routines are not part of the original softfloat distribution.
 5402  *
 5403  * They are based on the corresponding conversions to integer but return
 5404  * unsigned numbers instead since these functions are required by GCC.
 5405  *
 5406  * Added by Mark Brinicombe <mark@NetBSD.org>   27/09/97
 5407  *
 5408  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
 5409  */
 5410 
 5411 /*
 5412 -------------------------------------------------------------------------------
 5413 Returns the result of converting the double-precision floating-point value
 5414 `a' to the 32-bit unsigned integer format.  The conversion is
 5415 performed according to the IEC/IEEE Standard for Binary Floating-point
 5416 Arithmetic, except that the conversion is always rounded toward zero.  If
 5417 `a' is a NaN, the largest positive integer is returned.  If the conversion
 5418 overflows, the largest integer positive is returned.
 5419 -------------------------------------------------------------------------------
 5420 */
 5421 uint32 float64_to_uint32_round_to_zero( float64 a )
 5422 {
 5423     flag aSign;
 5424     int16 aExp, shiftCount;
 5425     bits64 aSig, savedASig;
 5426     uint32 z;
 5427 
 5428     aSig = extractFloat64Frac( a );
 5429     aExp = extractFloat64Exp( a );
 5430     aSign = extractFloat64Sign( a );
 5431 
 5432     if (aSign) {
 5433         float_raise( float_flag_invalid );
 5434         return(0);
 5435     }
 5436 
 5437     if ( 0x41E < aExp ) {
 5438         float_raise( float_flag_invalid );
 5439         return 0xffffffff;
 5440     }
 5441     else if ( aExp < 0x3FF ) {
 5442         if ( aExp || aSig ) float_set_inexact();
 5443         return 0;
 5444     }
 5445     aSig |= LIT64( 0x0010000000000000 );
 5446     shiftCount = 0x433 - aExp;
 5447     savedASig = aSig;
 5448     aSig >>= shiftCount;
 5449     z = aSig;
 5450     if ( ( aSig<<shiftCount ) != savedASig ) {
 5451         float_set_inexact();
 5452     }
 5453     return z;
 5454 
 5455 }
 5456 
 5457 /*
 5458 -------------------------------------------------------------------------------
 5459 Returns the result of converting the single-precision floating-point value
 5460 `a' to the 32-bit unsigned integer format.  The conversion is
 5461 performed according to the IEC/IEEE Standard for Binary Floating-point
 5462 Arithmetic, except that the conversion is always rounded toward zero.  If
 5463 `a' is a NaN, the largest positive integer is returned.  If the conversion
 5464 overflows, the largest positive integer is returned.
 5465 -------------------------------------------------------------------------------
 5466 */
 5467 uint32 float32_to_uint32_round_to_zero( float32 a )
 5468 {
 5469     flag aSign;
 5470     int16 aExp, shiftCount;
 5471     bits32 aSig;
 5472     uint32 z;
 5473 
 5474     aSig = extractFloat32Frac( a );
 5475     aExp = extractFloat32Exp( a );
 5476     aSign = extractFloat32Sign( a );
 5477     shiftCount = aExp - 0x9E;
 5478 
 5479     if (aSign) {
 5480         float_raise( float_flag_invalid );
 5481         return(0);
 5482     }
 5483     if ( 0 < shiftCount ) {
 5484         float_raise( float_flag_invalid );
 5485         return 0xFFFFFFFF;
 5486     }
 5487     else if ( aExp <= 0x7E ) {
 5488         if ( aExp | aSig ) float_set_inexact();
 5489         return 0;
 5490     }
 5491     aSig = ( aSig | 0x800000 )<<8;
 5492     z = aSig>>( - shiftCount );
 5493     if ( aSig<<( shiftCount & 31 ) ) {
 5494         float_set_inexact();
 5495     }
 5496     return z;
 5497 
 5498 }
 5499 
 5500 #endif
 5501 
 5502 #endif /* _STANDALONE */

Cache object: dbb6d0ca95b0141444119bde0674e45e


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.