Subject: Re: [Boost-bugs] [Boost C++ Libraries] #12527: cpp_bin_float: Anal fixation. Part 3. Double rounding when result of convert_to<double>() is a subnormal
From: Boost C++ Libraries (noreply_at_[hidden])
Date: 2016-11-29 22:43:55
#12527: cpp_bin_float: Anal fixation. Part 3. Double rounding when result of
convert_to<double>() is a subnormal
-------------------------------+----------------------------
Reporter: Michael Shatz | Owner: johnmaddock
Type: Bugs | Status: reopened
Milestone: To Be Determined | Component: multiprecision
Version: Boost 1.62.0 | Severity: Problem
Resolution: | Keywords:
-------------------------------+----------------------------
Comment (by Michael Shatz):
Just for fun, I coded "light-speed" variant of conversion routine, that
detects and specializes cases of 'Float' equivalent to IEEE-754 binary32
and binary64 and uses bit-level knowledge of this formats in order to
implement conversion that does not use FPU at all.
Enjoy:
{{{
template <class Float, unsigned Digits, digit_base_type DigitBase, class
Allocator, class Exponent, Exponent MinE, Exponent MaxE>
inline typename boost::enable_if_c<boost::is_float<Float>::value>::type
eval_convert_to(Float *res, const cpp_bin_float<Digits, DigitBase,
Allocator, Exponent, MinE, MaxE> &original_arg)
{
typedef cpp_bin_float<std::numeric_limits<Float>::digits, digit_base_2,
void, Exponent, MinE, MaxE> conv_type;
typedef typename common_type<typename conv_type::exponent_type,
int>::type common_exp_type;
//
// Special cases first:
//
switch(original_arg.exponent())
{
case cpp_bin_float<Digits, DigitBase, Allocator, Exponent, MinE,
MaxE>::exponent_zero:
*res = 0;
if(original_arg.sign())
*res = -*res;
return;
case cpp_bin_float<Digits, DigitBase, Allocator, Exponent, MinE,
MaxE>::exponent_nan:
*res = std::numeric_limits<Float>::quiet_NaN();
return;
case cpp_bin_float<Digits, DigitBase, Allocator, Exponent, MinE,
MaxE>::exponent_infinity:
*res = (std::numeric_limits<Float>::infinity)();
if(original_arg.sign())
*res = -*res;
return;
}
if (std::numeric_limits<Float>::radix==2) {
if (original_arg.exponent() >=
std::numeric_limits<Float>::max_exponent ||
original_arg.exponent() <
std::numeric_limits<Float>::min_exponent -
std::numeric_limits<Float>::digits -1)
{
Float ret =
original_arg.exponent() >=
std::numeric_limits<Float>::max_exponent ?
(std::numeric_limits<Float>::has_infinity ?
std::numeric_limits<Float>::infinity() :
std::numeric_limits<Float>::max() ):
0;
*res = original_arg.sign() ? -ret : ret;
return;
}
// fast handling of mundane cases
if (std::numeric_limits<Float>::digits <= 62) {
// extract 64 MS bits of significand
typename cpp_bin_float<Digits, DigitBase, Allocator, Exponent,
MinE, MaxE>::rep_type bits(original_arg.bits());
if (original_arg.bit_count > 64)
eval_right_shift(bits, original_arg.bit_count - 64);
uint64_t msbits;
eval_convert_to(&msbits, bits);
if (original_arg.bit_count < 64)
msbits <<= (64 - original_arg.bit_count) % 64;
int digits_to_round_to = std::numeric_limits<Float>::digits;
int e = original_arg.exponent();
if (e < std::numeric_limits<Float>::min_exponent-1) {
digits_to_round_to += e -
(std::numeric_limits<Float>::min_exponent-1);
}
const uint64_t offmask = uint64_t(-1) >> digits_to_round_to;
const uint64_t offmsbit = uint64_t(1) << (63-digits_to_round_to);
uint64_t offbits = msbits & offmask;
uint64_t onbits = (msbits >> 1) >> (63-digits_to_round_to);
offbits |= (onbits & 1); // tie rounded toward even
if (original_arg.bit_count > 64 && offbits == offmsbit) {
// Suspect for a tie. We have to examine lower bits of original
// The method used below is certainly not the fastest possible,
// but the code is short and does not rely on internal
representation of cpp_int
// Taking into account limits of my template-fu, that's not a
small advantage.
// I am feeling less guilty about it because in typical cases of
conversion (double/float)
// we will not arrive here too often
bits = original_arg.bits();
eval_left_shift(bits, 64);
if (!eval_eq(bits, limb_type(0)))
offbits |= 1;
}
if (((std::numeric_limits<Float>::digits==53 &&
sizeof(Float)==sizeof(uint64_t)) ||
(std::numeric_limits<Float>::digits==24 &&
sizeof(Float)==sizeof(uint32_t))) &&
std::numeric_limits<Float>::has_denorm == std::denorm_present
&&
std::numeric_limits<Float>::is_iec559)
{
// 'Float' is IEEE 754 binary64 or IEEE 754 binary32
// exploit knowledge of bit-level representation of IEEE 754
binaryXX
// for extremely fast replacement of ldexp()
if (e < std::numeric_limits<Float>::min_exponent-1)
e = std::numeric_limits<Float>::min_exponent-2;
onbits &= uint64_t(-1) >>
(65-std::numeric_limits<Float>::digits); // remove hidden bit
onbits += (offbits > offmsbit);
const int exp_bias = std::numeric_limits<Float>::digits==24 ?
127 : 1023;
onbits += uint64_t(exp_bias+e) <<
(std::numeric_limits<Float>::digits-1); // '+' rather than '|' , so case
of carry from significand into exponent handled automatically
onbits |= uint64_t(original_arg.sign() ? 1 : 0) <<
(std::numeric_limits<Float>::digits==24 ? 31 : 63);
if (sizeof(Float)==sizeof(uint32_t)) {
uint32_t u32 = static_cast<uint32_t>(onbits);
memcpy(res, &u32, sizeof(*res));
} else {
memcpy(res, &onbits, sizeof(*res));
}
return;
}
onbits += (offbits > offmsbit);
if (std::numeric_limits<double>::radix==2 &&
sizeof(uint64_t)==sizeof(double) &&
std::numeric_limits<double>::digits==53 &&
std::numeric_limits<double>::has_denorm == std::denorm_present
&&
std::numeric_limits<double>::is_iec559 && // platform's
'double' is IEEE 754 binary64
std::numeric_limits<Float>::digits <=
std::numeric_limits<double>::digits &&
std::numeric_limits<Float>::min_exponent >=
std::numeric_limits<double>::min_exponent &&
std::numeric_limits<Float>::max_exponent <=
std::numeric_limits<double>::max_exponent) {
// 'Float' is a subset of 'double'
// exploit knowledge of bit-level representation of IEEE 754
binary64
// for replacement of ldexp() which is not necessarily faster
than good library implementation,
// but have more robust performance
int ee = e+1-digits_to_round_to;
if (ee & 1) {
onbits <<= 1;
ee -= 1;
}
double ret = static_cast<Float>(static_cast<int64_t>(onbits));
ret = original_arg.sign() ? -ret : ret;
int eh = ee / 2;
uint64_t u_scale = uint64_t(1023 + eh) << 52;
double d_scale;
memcpy(&d_scale, &u_scale, sizeof(d_scale)); // d_scale =
2**(ee/2)
*res = static_cast<Float>((ret * d_scale) * d_scale);
return;
}
Float ret = static_cast<Float>(static_cast<int64_t>(onbits));
ret = original_arg.sign() ? -ret : ret;
*res = std::ldexp(ret, e+1-digits_to_round_to);
return;
}
}
//
// Check for super large exponent that must be converted to infinity:
//
if(original_arg.exponent() > std::numeric_limits<Float>::max_exponent)
{
*res = std::numeric_limits<Float>::has_infinity ?
std::numeric_limits<Float>::infinity() :
(std::numeric_limits<Float>::max)();
if(original_arg.sign())
*res = -*res;
return;
}
//
// Figure out how many digits we will have in our result,
// allowing for a possibly denormalized result:
//
common_exp_type digits_to_round_to =
std::numeric_limits<Float>::digits;
if(original_arg.exponent() < std::numeric_limits<Float>::min_exponent -
1)
{
common_exp_type diff = original_arg.exponent();
diff -= std::numeric_limits<Float>::min_exponent - 1;
digits_to_round_to += diff;
}
if(digits_to_round_to < 0)
{
// Result must be zero:
*res = 0;
if(original_arg.sign())
*res = -*res;
return;
}
//
// Perform rounding first, then afterwards extract the digits:
//
cpp_bin_float<std::numeric_limits<Float>::digits, digit_base_2,
Allocator, Exponent, MinE, MaxE> arg;
typename cpp_bin_float<Digits, DigitBase, Allocator, Exponent, MinE,
MaxE>::rep_type bits(original_arg.bits());
arg.exponent() = original_arg.exponent();
copy_and_round(arg, bits, (int)digits_to_round_to);
common_exp_type e = arg.exponent();
e -= cpp_bin_float<Digits, DigitBase, Allocator, Exponent, MinE,
MaxE>::bit_count - 1;
static const unsigned limbs_needed = std::numeric_limits<Float>::digits
/ (sizeof(*arg.bits().limbs()) * CHAR_BIT)
+ (std::numeric_limits<Float>::digits % (sizeof(*arg.bits().limbs())
* CHAR_BIT) ? 1 : 0);
unsigned first_limb_needed = arg.bits().size() - limbs_needed;
*res = 0;
e += first_limb_needed * sizeof(*arg.bits().limbs()) * CHAR_BIT;
while(first_limb_needed < arg.bits().size())
{
*res +=
std::ldexp(static_cast<Float>(arg.bits().limbs()[first_limb_needed]),
static_cast<int>(e));
++first_limb_needed;
e += sizeof(*arg.bits().limbs()) * CHAR_BIT;
}
if(original_arg.sign())
*res = -*res;
}
}}}
-- Ticket URL: <https://svn.boost.org/trac/boost/ticket/12527#comment:21> Boost C++ Libraries <http://www.boost.org/> Boost provides free peer-reviewed portable C++ source libraries.
This archive was generated by hypermail 2.1.7 : 2017-02-16 18:50:20 UTC