Re: [Boost-bugs] [Boost C++ Libraries] #12527: cpp_bin_float: Anal fixation. Part 3. Double rounding when result of convert_to<double>() is a subnormal

Subject: Re: [Boost-bugs] [Boost C++ Libraries] #12527: cpp_bin_float: Anal fixation. Part 3. Double rounding when result of convert_to<double>() is a subnormal
From: Boost C++ Libraries (noreply_at_[hidden])
Date: 2016-11-29 22:43:55


#12527: cpp_bin_float: Anal fixation. Part 3. Double rounding when result of
convert_to<double>() is a subnormal
-------------------------------+----------------------------
  Reporter: Michael Shatz | Owner: johnmaddock
      Type: Bugs | Status: reopened
 Milestone: To Be Determined | Component: multiprecision
   Version: Boost 1.62.0 | Severity: Problem
Resolution: | Keywords:
-------------------------------+----------------------------

Comment (by Michael Shatz):

 Just for fun, I coded "light-speed" variant of conversion routine, that
 detects and specializes cases of 'Float' equivalent to IEEE-754 binary32
 and binary64 and uses bit-level knowledge of this formats in order to
 implement conversion that does not use FPU at all.
 Enjoy:


 {{{
 template <class Float, unsigned Digits, digit_base_type DigitBase, class
 Allocator, class Exponent, Exponent MinE, Exponent MaxE>
 inline typename boost::enable_if_c<boost::is_float<Float>::value>::type
 eval_convert_to(Float *res, const cpp_bin_float<Digits, DigitBase,
 Allocator, Exponent, MinE, MaxE> &original_arg)
 {
    typedef cpp_bin_float<std::numeric_limits<Float>::digits, digit_base_2,
 void, Exponent, MinE, MaxE> conv_type;
    typedef typename common_type<typename conv_type::exponent_type,
 int>::type common_exp_type;
    //
    // Special cases first:
    //
    switch(original_arg.exponent())
    {
    case cpp_bin_float<Digits, DigitBase, Allocator, Exponent, MinE,
 MaxE>::exponent_zero:
       *res = 0;
       if(original_arg.sign())
          *res = -*res;
       return;
    case cpp_bin_float<Digits, DigitBase, Allocator, Exponent, MinE,
 MaxE>::exponent_nan:
       *res = std::numeric_limits<Float>::quiet_NaN();
       return;
    case cpp_bin_float<Digits, DigitBase, Allocator, Exponent, MinE,
 MaxE>::exponent_infinity:
       *res = (std::numeric_limits<Float>::infinity)();
       if(original_arg.sign())
          *res = -*res;
       return;
    }

    if (std::numeric_limits<Float>::radix==2) {
      if (original_arg.exponent() >=
 std::numeric_limits<Float>::max_exponent ||
          original_arg.exponent() <
 std::numeric_limits<Float>::min_exponent -
 std::numeric_limits<Float>::digits -1)
       {
         Float ret =
           original_arg.exponent() >=
 std::numeric_limits<Float>::max_exponent ?
             (std::numeric_limits<Float>::has_infinity ?
               std::numeric_limits<Float>::infinity() :
               std::numeric_limits<Float>::max() ):
             0;
         *res = original_arg.sign() ? -ret : ret;
         return;
       }

       // fast handling of mundane cases
       if (std::numeric_limits<Float>::digits <= 62) {
         // extract 64 MS bits of significand
         typename cpp_bin_float<Digits, DigitBase, Allocator, Exponent,
 MinE, MaxE>::rep_type bits(original_arg.bits());
         if (original_arg.bit_count > 64)
           eval_right_shift(bits, original_arg.bit_count - 64);
         uint64_t msbits;
         eval_convert_to(&msbits, bits);
         if (original_arg.bit_count < 64)
           msbits <<= (64 - original_arg.bit_count) % 64;

         int digits_to_round_to = std::numeric_limits<Float>::digits;
         int e = original_arg.exponent();
         if (e < std::numeric_limits<Float>::min_exponent-1) {
           digits_to_round_to += e -
 (std::numeric_limits<Float>::min_exponent-1);
         }

         const uint64_t offmask = uint64_t(-1) >> digits_to_round_to;
         const uint64_t offmsbit = uint64_t(1) << (63-digits_to_round_to);
         uint64_t offbits = msbits & offmask;
         uint64_t onbits = (msbits >> 1) >> (63-digits_to_round_to);
         offbits |= (onbits & 1); // tie rounded toward even

         if (original_arg.bit_count > 64 && offbits == offmsbit) {
           // Suspect for a tie. We have to examine lower bits of original
           // The method used below is certainly not the fastest possible,
           // but the code is short and does not rely on internal
 representation of cpp_int
           // Taking into account limits of my template-fu, that's not a
 small advantage.
           // I am feeling less guilty about it because in typical cases of
 conversion (double/float)
           // we will not arrive here too often
           bits = original_arg.bits();
           eval_left_shift(bits, 64);
           if (!eval_eq(bits, limb_type(0)))
             offbits |= 1;
         }

         if (((std::numeric_limits<Float>::digits==53 &&
 sizeof(Float)==sizeof(uint64_t)) ||
              (std::numeric_limits<Float>::digits==24 &&
 sizeof(Float)==sizeof(uint32_t))) &&
             std::numeric_limits<Float>::has_denorm == std::denorm_present
 &&
             std::numeric_limits<Float>::is_iec559)
         {
           // 'Float' is IEEE 754 binary64 or IEEE 754 binary32
           // exploit knowledge of bit-level representation of IEEE 754
 binaryXX
           // for extremely fast replacement of ldexp()
           if (e < std::numeric_limits<Float>::min_exponent-1)
             e = std::numeric_limits<Float>::min_exponent-2;
           onbits &= uint64_t(-1) >>
 (65-std::numeric_limits<Float>::digits); // remove hidden bit
           onbits += (offbits > offmsbit);
           const int exp_bias = std::numeric_limits<Float>::digits==24 ?
 127 : 1023;
           onbits += uint64_t(exp_bias+e) <<
 (std::numeric_limits<Float>::digits-1); // '+' rather than '|' , so case
 of carry from significand into exponent handled automatically
           onbits |= uint64_t(original_arg.sign() ? 1 : 0) <<
             (std::numeric_limits<Float>::digits==24 ? 31 : 63);
           if (sizeof(Float)==sizeof(uint32_t)) {
             uint32_t u32 = static_cast<uint32_t>(onbits);
             memcpy(res, &u32, sizeof(*res));
           } else {
             memcpy(res, &onbits, sizeof(*res));
           }
           return;
         }

         onbits += (offbits > offmsbit);
         if (std::numeric_limits<double>::radix==2 &&
             sizeof(uint64_t)==sizeof(double) &&
             std::numeric_limits<double>::digits==53 &&
             std::numeric_limits<double>::has_denorm == std::denorm_present
 &&
             std::numeric_limits<double>::is_iec559 && // platform's
 'double' is IEEE 754 binary64
             std::numeric_limits<Float>::digits <=
 std::numeric_limits<double>::digits &&
             std::numeric_limits<Float>::min_exponent >=
 std::numeric_limits<double>::min_exponent &&
             std::numeric_limits<Float>::max_exponent <=
 std::numeric_limits<double>::max_exponent) {
           // 'Float' is a subset of 'double'
           // exploit knowledge of bit-level representation of IEEE 754
 binary64
           // for replacement of ldexp() which is not necessarily faster
 than good library implementation,
           // but have more robust performance
           int ee = e+1-digits_to_round_to;
           if (ee & 1) {
             onbits <<= 1;
             ee -= 1;
           }
           double ret = static_cast<Float>(static_cast<int64_t>(onbits));
           ret = original_arg.sign() ? -ret : ret;
           int eh = ee / 2;
           uint64_t u_scale = uint64_t(1023 + eh) << 52;
           double d_scale;
           memcpy(&d_scale, &u_scale, sizeof(d_scale)); // d_scale =
 2**(ee/2)
           *res = static_cast<Float>((ret * d_scale) * d_scale);
           return;
         }

         Float ret = static_cast<Float>(static_cast<int64_t>(onbits));
         ret = original_arg.sign() ? -ret : ret;
         *res = std::ldexp(ret, e+1-digits_to_round_to);
         return;
       }
    }

    //
    // Check for super large exponent that must be converted to infinity:
    //
    if(original_arg.exponent() > std::numeric_limits<Float>::max_exponent)
    {
       *res = std::numeric_limits<Float>::has_infinity ?
 std::numeric_limits<Float>::infinity() :
 (std::numeric_limits<Float>::max)();
       if(original_arg.sign())
          *res = -*res;
       return;
    }
    //
    // Figure out how many digits we will have in our result,
    // allowing for a possibly denormalized result:
    //
    common_exp_type digits_to_round_to =
 std::numeric_limits<Float>::digits;
    if(original_arg.exponent() < std::numeric_limits<Float>::min_exponent -
 1)
    {
       common_exp_type diff = original_arg.exponent();
       diff -= std::numeric_limits<Float>::min_exponent - 1;
       digits_to_round_to += diff;
    }
    if(digits_to_round_to < 0)
    {
       // Result must be zero:
       *res = 0;
       if(original_arg.sign())
          *res = -*res;
       return;
    }


    //
    // Perform rounding first, then afterwards extract the digits:
    //
    cpp_bin_float<std::numeric_limits<Float>::digits, digit_base_2,
 Allocator, Exponent, MinE, MaxE> arg;
    typename cpp_bin_float<Digits, DigitBase, Allocator, Exponent, MinE,
 MaxE>::rep_type bits(original_arg.bits());
    arg.exponent() = original_arg.exponent();
    copy_and_round(arg, bits, (int)digits_to_round_to);
    common_exp_type e = arg.exponent();
    e -= cpp_bin_float<Digits, DigitBase, Allocator, Exponent, MinE,
 MaxE>::bit_count - 1;
    static const unsigned limbs_needed = std::numeric_limits<Float>::digits
 / (sizeof(*arg.bits().limbs()) * CHAR_BIT)
       + (std::numeric_limits<Float>::digits % (sizeof(*arg.bits().limbs())
 * CHAR_BIT) ? 1 : 0);
    unsigned first_limb_needed = arg.bits().size() - limbs_needed;
    *res = 0;
    e += first_limb_needed * sizeof(*arg.bits().limbs()) * CHAR_BIT;
    while(first_limb_needed < arg.bits().size())
    {
       *res +=
 std::ldexp(static_cast<Float>(arg.bits().limbs()[first_limb_needed]),
 static_cast<int>(e));
       ++first_limb_needed;
       e += sizeof(*arg.bits().limbs()) * CHAR_BIT;
    }
    if(original_arg.sign())
       *res = -*res;
 }

 }}}

-- 
Ticket URL: <https://svn.boost.org/trac/boost/ticket/12527#comment:21>
Boost C++ Libraries <http://www.boost.org/>
Boost provides free peer-reviewed portable C++ source libraries.

This archive was generated by hypermail 2.1.7 : 2017-02-16 18:50:20 UTC