Boost logo

Boost :

From: Matt Hurd (matt.hurd_at_[hidden])
Date: 2005-01-17 04:51:00


On Mon, 17 Jan 2005 09:37:34 +0100, Matthias Troyer
<troyer_at_[hidden]> wrote:
>
> I'm very interested if this scales well for a larger number of keys, of
> the order of 10^6 - 10^8 keys. I guess that for those sizes the scaling
> will already be exponential. Can you tell us what the maximum number of
> keys is that can be efficiently treated?
>
> Matthias

Howdy,

Perfect hashing and statistically reasonable hashing are quite
different but practically the same ;-) For 2^128 keys you might like
to try something these papers allude to:

R. Uzgalis and M. Tong. Hashing myths. Technical Report 97, Department
of Computer Science University of Auckland, July 1994.

and

John Hamer. Hashing Revisited. Proceedings of the 7th annual
conference on Innovation and technology in computer science education
Aarhus, Denmark SESSION: Algorithms
Pages: 80 - 83 Year of Publication: 2002 ISBN:1-58113-499-1

I feel for R. Uzgalis though, as I think of these two papers as Hamer
and Tong ;-)

Hope this helps,

matt.
______________________________

PS: A bit of bad code to show how it works is below, feel free to use
it to start something decent if you like:

Not sure if the code works or it is a w.i.p. as it is from an odd set
of files I have on this machine. Looks about right though.

/*-------------------------------------------------------------------
        created: 2004-4-16 15:58
        filename: hash.hpp
        author: Matt Hurd
        
        purpose:
---------------------------------------------------------------------*/

#ifndef hash_hpp_2004416
#define hash_hpp_2004416

#include <boost/random.hpp>
#include <vector>

namespace net {

static unsigned char hash_byte_transform_1[256]
    = { 129, 186, 126, 139, 50, 173, 232, 9, 144, 73, 174, 153, 203, 110, 206,
        254, 150, 83, 108, 109, 249, 55, 231, 62, 115, 191, 101, 105, 252, 237,
        49, 182, 64, 37, 13, 46, 152, 239, 42, 18, 247, 195, 86, 98, 102, 215,
        227, 200, 65, 246, 80, 118, 250, 207, 214, 210, 149, 8, 241, 85, 243,
        107, 199, 188, 202, 59, 162, 25, 103, 204, 43, 168, 221, 197, 53, 253,
        234, 19, 20, 15, 242, 7, 56, 190, 233, 14, 79, 4, 163, 29, 170, 194,
        184, 72, 240, 16, 45, 176, 31, 196, 238, 235, 140, 217, 93, 39, 143,
        164, 40, 78, 99, 17, 212, 216, 166, 171, 167, 142, 224, 97, 169, 75, 48,
        159, 180, 141, 209, 104, 28, 116, 44, 255, 47, 156, 77, 58, 91, 181,
        245, 87, 0, 160, 133, 111, 26, 6, 230, 95, 70, 161, 137, 89, 68, 172,
        244, 90, 1, 223, 81, 33, 67, 30, 132, 82, 236, 100, 138, 36, 183, 251,
        165, 179, 60, 185, 66, 123, 61, 218, 125, 189, 3, 178, 92, 35, 34, 130,
        69, 198, 177, 32, 63, 208, 21, 94, 193, 134, 71, 10, 154, 96, 128, 157,
        54, 187, 120, 192, 145, 41, 11, 225, 57, 52, 131, 23, 84, 222, 74, 151,
        127, 27, 5, 219, 228, 201, 113, 76, 38, 136, 2, 88, 146, 106, 22, 147,
        213, 205, 114, 226, 229, 24, 175, 122, 124, 135, 121, 158, 220, 12, 211,
        117, 112, 51, 148, 248, 119, 155
    };

class hash
{
public:

    static unsigned char transform( unsigned char val)
    {
        return hash_byte_transform_1[val];
    }

    template< typename T >
    static std::size_t transform( const T& val_a )
    {
        union unholy
        {
            T val_;
            unsigned char byte_[sizeof(T)];
        };

        const unholy * t = reinterpret_cast<const unholy*>(&val_a);

        std::size_t result = 123456789;
        result ^= hash_byte_transform_1[t->byte_[0]];

        for (int i = 1; i < sizeof(T); ++i)
        {
            result = (result << 8)
                | (result >> (8 * (sizeof(std::size_t) - 1)));
            result ^= hash_byte_transform_1[t->byte_[i]];
        }

        return result;
    }

    template <>
    static std::size_t transform( const std::string& val )
    {
        const int size = val.length();
        std::size_t result = 123456789;

        if ( size > 0 )
        {
            std::string::const_iterator i = val.begin();
            std::string::const_iterator i_end = val.end();

            result ^= hash_byte_transform_1[ unsigned char(*i) ];

            for ( ;i != i_end; ++i)
            {
                result = (result << 8)
                    | (result >> (8 * (sizeof(std::size_t) - 1)));
                result ^= hash_byte_transform_1[ unsigned char(*i) ];
            }
        }

        return result;
    }

    static void prepare_random_list()
    {
        boost::mt19937 rng;
        rng.seed( 42 ); // the base to the meaning of life is chance
        boost::uniform_int<> byte_range(0,255);
        boost::variate_generator<boost::mt19937, boost::uniform_int<>
> die(rng, byte_range);

        std::vector<int> v;

        v.resize(256);

        // initialise
        for (std::size_t i = 0; i < 256; ++i) {
            v[i] = i;
        }

        // shuffle
        for (std::size_t i = 0; i < 256; ++i)
        {
            std::swap( v[i], v[255 - (die() % (256 - i))] );
        }

        // output results for pasting
        for (std::size_t i = 0; i < 256; ++i)
        {
            std::cout << v[i] << ", ";
        }

        std::cout << "\n";
    }
};

} // namespace

#endif


Boost list run by bdawes at acm.org, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk