Boost logo

Boost :

Subject: Re: [boost] [review] Last day for formal review for Sort library
From: Steven Ross (spreadsort_at_[hidden])
Date: 2014-12-09 06:17:07


Dupuis,

On Fri Nov 21 2014 at 12:01:24 AM Steven Ross <spreadsort_at_[hidden]> wrote:

> Thanks for your input Dupuis.
>
> On Thu Nov 20 2014 at 10:32:46 AM DUPUIS Etienne <e.dupuis_at_[hidden]>
> wrote:
>
>>
>>
>> Another example, sorting by birth data and then by name :
>>
>> // --------------- Start of code snippet ------------------------
>> struct Id {
>> std::string name;
>> time_t birth;
>>
>> bool operator<(const Id& id) const {
>> return (birth < id.birth) || ((birth == id.birth) && (name <
>> id.name));
>> }
>> };
>
> // --------------- End of code snippet ------------------------
>> Again, I understand that I must use string_sort and split the birth date
>> into 8-bit components. Am I right ?
>>
>
> That's a good idea! I'll add such an example if the library is deemed
> worthy of including in boost.
>
>
I've added such an example to the develop branch that handles signed ints,
signed floats, and multiple string keys all together:
https://github.com/spreadsort/sort/blob/develop/example/generalizedstruct.cpp
And will mention it in the docs. Surprisingly, considering all the
complexity of its get_char function, it's about 50% faster than std::sort
on random data formatted like this. Below are the comparison and get_char
functions for you to see. Comparison sorting is definitely easier, but
hybrid-radix sorting is feasible and surprisingly efficient even for
harder-to-represent cases:

struct DATA_TYPE {
  time_t birth;
  float net_worth;
  string first_name;
  string last_name;
};

static const int birth_size = sizeof(time_t);
static const int first_name_offset = birth_size + sizeof(float);
static const boost::uint64_t base_mask = 0xff;

struct lessthan {
  inline bool operator()(const DATA_TYPE &x, const DATA_TYPE &y) const {
    if (x.birth != y.birth) {
      return x.birth < y.birth;
    }
    if (x.net_worth != y.net_worth) {
      return x.net_worth < y.net_worth;
    }
    if (x.first_name != y.first_name) {
      return x.first_name < y.first_name;
    }
    return x.last_name < y.last_name;
  }
};

struct bracket {
  inline unsigned char operator()(const DATA_TYPE &x, size_t offset) const {
    // Sort date as a signed int, returning the appropriate byte.
    if (offset < birth_size) {
      const int bit_shift = 8 * (birth_size - offset - 1);
      unsigned char result = (x.birth & (base_mask << bit_shift)) >>
bit_shift;
      // Handling the sign bit. Unnecessary if the data is always positive.
      if (offset == 0) {
        return result ^ 128;
      }

      return result;
    }

    // Sort a signed float. This requires reversing the order of negatives
    // because of the way floats are represented in bits.
    if (offset < first_name_offset) {
      const int bit_shift = 8 * (first_name_offset - offset - 1);
      unsigned key = float_mem_cast<float, unsigned>(x.net_worth);
      unsigned char result = (key & (base_mask << bit_shift)) >> bit_shift;
      // Handling the sign.
      if (x.net_worth < 0) {
        return 255 - result;
      }
      // Increasing positives so they are higher than negatives.
      if (offset == birth_size) {
        return 128 + result;
      }

      return result;
    }

    // Sort a string that is before the end. This approach supports
embedded
    // nulls. If embedded nulls are not required, then just delete the "*
2"
    // and the inside of the following if just becomes:
    // return x.first_name[offset - first_name_offset];
    const unsigned first_name_end_offset =
      first_name_offset + x.first_name.size() * 2;
    if (offset < first_name_end_offset) {
      int char_offset = offset - first_name_offset;
      // This signals that the string continues.
      if (!(char_offset & 1)) {
        return 1;
      }
      return x.first_name[char_offset >> 1];
    }

    // This signals that the string has ended, so that shorter strings come
    // before longer ones.
    if (offset == first_name_end_offset) {
      return 0;
    }

    // The final string needs no special consideration.
    return x.last_name[offset - first_name_end_offset - 1];
  }
};

struct getsize {
  inline size_t operator()(const DATA_TYPE &x) const {
    return first_name_offset + x.first_name.size() * 2 + 1 +
      x.last_name.size();
  }
};


Boost list run by bdawes at acm.org, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk