Boost logo

Boost-Commit :

From: arseny.kapoulkine_at_[hidden]
Date: 2007-08-02 17:02:50


Author: zeux
Date: 2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
New Revision: 38408
URL: http://svn.boost.org/trac/boost/changeset/38408

Log:
FFT improvements & refactoring
Added:
   sandbox/SOC/2007/bigint/fft/primes.h (contents, props changed)
Removed:
   sandbox/SOC/2007/bigint/fft/main_slow.cpp
   sandbox/SOC/2007/bigint/fft/main_word.cpp
   sandbox/SOC/2007/bigint/fft/main_word_fast.cpp
   sandbox/SOC/2007/bigint/fft/main_word_weird.cpp
   sandbox/SOC/2007/bigint/fft/sieve.cpp
Text files modified:
   sandbox/SOC/2007/bigint/fft/main.cpp | 374 +++++++++++++++++++--------------------
   sandbox/SOC/2007/bigint/fft/roots.cpp | 31 +++
   2 files changed, 211 insertions(+), 194 deletions(-)

Modified: sandbox/SOC/2007/bigint/fft/main.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/main.cpp (original)
+++ sandbox/SOC/2007/bigint/fft/main.cpp 2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
@@ -1,58 +1,48 @@
 #include <iostream>
-
+#include <fstream>
 #include <windows.h>
 
-const int base = 10;
-typedef short limb_t;
+#include "primes.h"
 
-struct print
+template <typename T> void P(const char* text, const T* data, size_t size)
 {
- const limb_t* _data;
- size_t _size;
-
- print(const limb_t* data, size_t size): _data(data), _size(size)
- {
- }
+ std::cout << text;
+ for (size_t i = 0; i < size; ++i) std::cout << " " << std::dec << data[i] << std::dec;
+ std::cout << std::endl;
+}
 
- std::ostream& operator()(std::ostream& os) const
- {
- for (const limb_t* i = _data; i != _data + _size; ++i)
- os << *i;
-
- return os;
- }
-};
+template <typename T> void S(const char* file, const T* data, size_t size)
+{
+ std::ofstream out(file);
+ for (size_t i = 0; i < size; ++i) out << std::hex << data[i] << std::dec;
+}
 
-template <typename T> void P(const char* text, const T* data, size_t size)
+unsigned int modmul(unsigned int a, unsigned int b, unsigned int prime)
 {
- std::cout << text << ": ";
- for (size_t i = 0; i < size; ++i) std::cout << data[i] << " ";
- std::cout << std::endl;
+ return static_cast<unsigned int>(static_cast<unsigned __int64>(a) * b % prime);
 }
 
-int pow(int base, int power, int prime)
+unsigned int modadd(unsigned int a, unsigned int b, unsigned int prime)
 {
- int pot = power;
+ return (a + b) % prime;
+}
 
- // Find largest power of two that is >= rhs
- while (pot < power && (pot << 1) != 0)
- pot <<= 1;
+unsigned int modsub(unsigned int a, unsigned int b, unsigned int prime)
+{
+ return (a + prime - b) % prime;
+}
 
- int res = 1;
+unsigned int log2(unsigned int value)
+{
+ unsigned int r = 0;
 
- while (pot > 0)
+ while (value > 1)
         {
- res = (res * res) % prime;
-
- if ((power & pot) != 0)
- {
- res = (res * base) % prime;
- }
-
- pot >>= 1;
+ ++r;
+ value >>= 1;
         }
 
- return res;
+ return r;
 }
 
 // reverse(x) -> reverse(x+1)
@@ -69,20 +59,6 @@
 }
 
 // swaps values with indices that have reversed bit representation (data[1100b] <-> data[0011b])
-void fft_reorder(unsigned int* data, size_t size)
-{
- if (size <= 2) return;
-
- size_t r = 0;
-
- for (size_t x = 1; x < size; ++x)
- {
- r = rev_next(r, size);
-
- if (r > x) std::swap(data[x], data[r]);
- }
-}
-
 template <typename T> void fft_copy_reorder(unsigned int* dest, const T* source, size_t size)
 {
         if (size <= 2)
@@ -103,46 +79,40 @@
         }
 }
 
-void fft_recursive(unsigned int* data, size_t size, int prime, int root)
+void fft_recursive(unsigned int* data, size_t size, unsigned int prime, const unsigned int* roots)
 {
         size /= 2;
 
         if (size > 1)
         {
- int new_root = (root * root) % prime;
- fft_recursive(data, size, prime, new_root);
- fft_recursive(data + size, size, prime, new_root);
+ fft_recursive(data, size, prime, roots - 1);
+ fft_recursive(data + size, size, prime, roots - 1);
         }
 
+ unsigned int root = *roots;
         unsigned int r = 1;
 
         for (size_t i = 0; i < size; ++i)
         {
                 unsigned int a = data[i];
- unsigned int b = (data[i + size] * r) % prime;
+ unsigned int b = modmul(data[i + size], r, prime);
 
- data[i] = (a + b) % prime;
- data[i + size] = (a + prime - b) % prime;
+ data[i] = modadd(a, b, prime);
+ data[i + size] = modsub(a, b, prime);
 
- r = (r * root) % prime;
+ r = modmul(r, root, prime);
         }
 }
 
-void fft_iterative(unsigned int* data, size_t size, int prime, int root)
+void fft_iterative(unsigned int* data, size_t size, unsigned int prime, const unsigned int* roots)
 {
         size_t step = 1;
 
- int roots[4];
- roots[3] = root;
- roots[2] = (roots[3] * roots[3]) % prime;
- roots[1] = (roots[2] * roots[2]) % prime;
- roots[0] = (roots[1] * roots[1]) % prime;
-
- int nstep = 0;
+ unsigned int nstep = 0;
 
         while (step < size)
         {
- root = roots[nstep++];
+ unsigned int root = roots[++nstep];
 
                 size_t half_step = step;
                 step *= 2;
@@ -154,208 +124,210 @@
                         for (size_t i = j; i < size; i += step)
                         {
                                 unsigned int a = data[i];
- unsigned int b = (data[i + half_step] * r) % prime;
+ unsigned int b = modmul(data[i + half_step], r, prime);
 
- data[i] = (a + b) % prime;
- data[i + half_step] = (a + prime - b) % prime;
+ data[i] = modadd(a, b, prime);
+ data[i + half_step] = modsub(a, b, prime);
                         }
                         
- r = (r * root) % prime;
+ r = modmul(r, root, prime);
                 }
         }
 }
 
-void fft(unsigned int* dest, const limb_t* source, size_t N, int prime, int root)
+void dft(unsigned int* dest, const unsigned short* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table)
 {
         fft_copy_reorder(dest, source, N);
- fft_recursive(dest, N, prime, root);
+ fft_recursive(dest, N, prime, root_table + log2N);
 }
 
-void fft_i(unsigned int* dest, const limb_t* source, size_t N, int prime, int root)
+void dft_i(unsigned int* dest, const unsigned short* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table)
 {
         fft_copy_reorder(dest, source, N);
- fft_iterative(dest, N, prime, root);
+ fft_iterative(dest, N, prime, root_table);
 }
 
-void fft_slow(unsigned int* dest, const limb_t* source, size_t N, int prime, int root)
+void ift(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
 {
- unsigned int p_j = 1;
-
- for (int j = 0; j < N; ++j)
- {
- unsigned int r = 0;
-
- unsigned int p_jk = 1;
-
- for (int k = 0; k < N; ++k)
- {
- r = (r + source[k] * p_jk) % prime;
- p_jk = (p_jk * p_j) % prime;
- }
-
- dest[j] = r;
-
- p_j = (p_j * root) % prime;
- }
-}
-
-void ift(unsigned int* dest, const unsigned int* source, size_t N, int prime, int root)
-{
- int rroot = pow(root, prime - 2, prime);
-
         fft_copy_reorder(dest, source, N);
- fft_recursive(dest, N, prime, rroot);
-
- unsigned int N_inv = pow(N, prime - 2, prime);
+ fft_recursive(dest, N, prime, root_table + log2N);
 
         for (size_t i = 0; i < N; ++i)
         {
- dest[i] = dest[i] * N_inv % prime;
+ dest[i] = modmul(dest[i], inv_N, prime);
         }
 }
 
-void ift_i(unsigned int* dest, const unsigned int* source, size_t N, int prime, int root)
+void ift_i(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
 {
- int rroot = pow(root, prime - 2, prime);
-
         fft_copy_reorder(dest, source, N);
- fft_iterative(dest, N, prime, rroot);
-
- unsigned int N_inv = pow(N, prime - 2, prime);
+ fft_iterative(dest, N, prime, root_table);
 
         for (size_t i = 0; i < N; ++i)
         {
- dest[i] = dest[i] * N_inv % prime;
+ dest[i] = modmul(dest[i], inv_N, prime);
         }
 }
 
-void ift_slow(unsigned int* dest, const unsigned int* source, size_t N, int prime, int root)
+void fft_crt_carry(unsigned short* dest, const unsigned int* conv0, const unsigned int* conv1, size_t N)
 {
- int rroot = pow(root, prime - 2, prime);
- int N_inv = pow(N, prime - 2, prime);
+ unsigned __int64 prime = static_cast<unsigned __int64>(fft_primes[0]) * fft_primes[1];
+ unsigned int inv_p0_mod_p1 = fft_inv_p0_mod_p1;
 
- unsigned int p_j = 1;
+ unsigned __int64 carry = 0;
 
- for (int j = 0; j < N; ++j)
- {
- unsigned int p_jk = 1;
+ for (int i = 0; i < N; ++i)
+ {
+ // 64 x == 32 c0 mod 32 p0
+ // 64 x == 32 c1 mod 32 p1
 
- unsigned int r = 0;
+ // x == p0*t + c0
+ // p0*t + c0 == c1 mod p1
+ // t == (c1 - c0) div p0 mod p1
 
- for (int k = 0; k < N; ++k)
- {
- r = (r + source[k] * p_jk) % prime;
+ // t == t0 mod p1
 
- p_jk = (p_jk * p_j) % prime;
- }
+ // x == p0 * t0 + c0
+ // t0 == (c1 - c0) div p0 mod p1
 
- r = (r * N_inv) % prime;
+ unsigned int t0 = modmul(modsub(conv1[i], conv0[i], fft_primes[1]), inv_p0_mod_p1, fft_primes[1]);
+ unsigned __int64 x = static_cast<unsigned __int64>(fft_primes[0]) * t0 + conv0[i];
 
- dest[j] = r;
-
- p_j = (p_j * rroot) % prime;
+ carry += x;
+
+ dest[i] = carry & 0xffff;
+ carry >>= 16;
         }
 }
 
-typedef void (*fftfunc)(unsigned int* dest, const limb_t* source, size_t N, int prime, int root);
-typedef void (*iftfunc)(unsigned int* dest, const unsigned int* source, size_t N, int prime, int root);
-
-void mul_fft(limb_t* dest, const limb_t* a, const limb_t* b, fftfunc fft, iftfunc ift)
+// precondition: dest has N elements allocated, a and b have N elements, last N/2 are 0, N is power of two
+// postcondition: dest has product, padded with zeroes
+void mul_fft(unsigned short* dest, const unsigned short* a, const unsigned short* b, size_t N,
+ void (*dft)(unsigned int* dest, const unsigned short* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table),
+ void (*ift)(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
+ )
 {
- // p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
- const size_t N = 16;
+ unsigned int log2N = log2(N);
         
- // N * base^2 == 1600
- // prime0 * prime1 must be > 1600
- const int primes[] = {17, 97};
-
- // primitive root (power N)
- const int roots[] = {3, 8};
-
- unsigned int convs[2][N];
+ unsigned int* workspace = new unsigned int[4*N];
+
+ unsigned int* convs[] = {workspace, workspace + N};
+ unsigned int* fft_a = workspace + 2*N;
+ unsigned int* fft_b = workspace + 3*N;
 
         for (int p = 0; p < 2; ++p)
         {
- unsigned int fft_a[N], fft_b[N], fft_c[N];
- fft(fft_a, a, N, primes[p], roots[p]);
- fft(fft_b, b, N, primes[p], roots[p]);
+ const unsigned int* root_table = fft_primitive_roots[p];
+ unsigned int prime = fft_primes[p];
+
+ dft(fft_a, a, N, log2N, prime, root_table);
+ dft(fft_b, b, N, log2N, prime, root_table);
 
                 for (size_t i = 0; i < N; ++i)
                 {
- fft_c[i] = (fft_a[i] * fft_b[i]) % primes[p];
+ fft_a[i] = modmul(fft_a[i], fft_b[i], prime);
                 }
+
+ const unsigned int* inv_root_table = fft_inv_primitive_roots[p];
+ unsigned int inv_N = fft_inv_N[p][log2N];
 
- ift(convs[p], fft_c, N, primes[p], roots[p]);
+ ift(convs[p], fft_a, N, log2N, prime, inv_root_table, inv_N);
         }
 
- // CRT:
- // x % p1 = r1
- // x % p2 = r2
+ fft_crt_carry(dest, convs[0], convs[1], N);
 
- // x % p1p2 = r1 * (P/p1)*T1 + r2 * (P/p2)*T2
- // T1 = (P/p1) ^ (p1 - 2)
- // T2 = (P/p2) ^ (p2 - 2)
+ delete[] workspace;
+}
 
- int prime = primes[0] * primes[1];
- int T[2] = {pow(primes[1], primes[0] - 2, primes[0]), pow(primes[0], primes[1] - 2, primes[1])};
+// precondition: dest has N elements allocated, a has N elements, last N/2 are 0, N is power of two
+// postcondition: dest has product, padded with zeroes
+void sqr_fft(unsigned short* dest, const unsigned short* a, size_t N,
+ void (*dft)(unsigned int* dest, const unsigned short* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table),
+ void (*ift)(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
+ )
+{
+ unsigned int log2N = log2(N);
 
- unsigned int carry = 0;
+ unsigned int* workspace = new unsigned int[3*N];
+
+ unsigned int* convs[] = {workspace, workspace + N};
+ unsigned int* fft = workspace + 2*N;
 
- for (int i = 0; i < N; ++i)
+ for (int p = 0; p < 2; ++p)
         {
- int conv = (convs[0][i] * T[0] * primes[1] + convs[1][i] * T[1] * primes[0]) % prime;
-
- carry += conv;
- dest[i] = carry % 10;
- carry /= 10;
+ const unsigned int* root_table = fft_primitive_roots[p];
+ unsigned int prime = fft_primes[p];
+
+ dft(fft, a, N, log2N, prime, root_table);
+
+ for (size_t i = 0; i < N; ++i)
+ {
+ fft[i] = modmul(fft[i], fft[i], prime);
+ }
+
+ const unsigned int* inv_root_table = fft_inv_primitive_roots[p];
+ unsigned int inv_N = fft_inv_N[p][log2N];
+
+ ift(convs[p], fft, N, log2N, prime, inv_root_table, inv_N);
         }
+
+ fft_crt_carry(dest, convs[0], convs[1], N);
+
+ delete[] workspace;
 }
 
-void mul_basecase(limb_t* dest, const limb_t* a, const limb_t* b)
+void mul_basecase(unsigned short* dest, const unsigned short* a, const unsigned short* b, size_t N)
 {
- typedef int limb2_t;
+ typedef unsigned int limb2_t;
 
- const size_t N = 16;
+ memset(dest, 0, sizeof(unsigned short) * N);
 
- memset(dest, 0, sizeof(limb_t) * N);
-
- limb_t* i = dest;
+ unsigned short* i = dest;
                         
- for (const limb_t* li = a; li != a + N/2; ++li, ++i)
+ for (const unsigned short* li = a; li != a + N/2; ++li, ++i)
         {
- limb_t carry = 0;
+ unsigned short carry = 0;
 
- limb_t* ci = i;
+ unsigned short* ci = i;
 
- for (const limb_t* ri = b; ri != b + N/2; ++ri)
+ for (const unsigned short* ri = b; ri != b + N/2; ++ri)
                 {
                         limb2_t result = static_cast<limb2_t>(*li) * *ri + *ci + carry;
 
- *ci++ = static_cast<limb_t>(result % 10);
+ *ci++ = static_cast<unsigned short>(result & 0xffff);
 
- carry = static_cast<limb_t>(result / 10);
+ carry = static_cast<unsigned short>(result >> 16);
                 }
 
                 while (carry != 0)
                 {
                         limb2_t result = static_cast<limb2_t>(*ci) + carry;
                                         
- *ci++ = static_cast<limb_t>(result % 10);
+ *ci++ = static_cast<unsigned short>(result & 0xffff);
 
- carry = static_cast<limb_t>(result / 10);
+ carry = static_cast<unsigned short>(result >> 16);
                 }
         }
 }
 
 int main()
 {
- limb_t a[16] = {2, 3, 4, 5, 6, 7, 8, 9};
- limb_t b[16] = {1, 2, 3, 4, 5, 6, 7, 8};
+ size_t bits = (1 << 20) * 16;
+ size_t N = bits / 16;
 
- limb_t c[16] = {};
+ unsigned short* a = new unsigned short[N];
+ unsigned short* b = new unsigned short[N];
+ unsigned short* c = new unsigned short[N];
 
- std::cout << "A = "; print(a, 16)(std::cout) << std::endl;
- std::cout << "B = "; print(b, 16)(std::cout) << std::endl;
+ for (size_t i = 0; i < N/2; ++i)
+ {
+ a[i] = b[i] = 65535;
+ }
+
+ for (size_t i = N/2; i < N; ++i)
+ {
+ a[i] = b[i] = 0;
+ }
         
         int count = 1;
         LARGE_INTEGER start, end, freq;
@@ -364,29 +336,45 @@
 
         QueryPerformanceCounter(&start);
         for (int i = 0; i < count; ++i)
- mul_fft(c, a, b, fft, ift);
+ mul_fft(c, a, b, N, dft, ift);
         QueryPerformanceCounter(&end);
-
- std::cout << "Recursive FFT: C = "; print(c, 16)(std::cout) << " " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+
+ std::cout << "Recursive: " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+ S("res_fft_rec.txt", c, N);
 
         QueryPerformanceCounter(&start);
         for (int i = 0; i < count; ++i)
- mul_fft(c, a, b, fft_i, ift_i);
+ mul_fft(c, a, b, N, dft_i, ift_i);
+ QueryPerformanceCounter(&end);
+
+ std::cout << "Iterative: " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+ S("res_fft_iter.txt", c, N);
+
+ QueryPerformanceCounter(&start);
+ for (int i = 0; i < count; ++i)
+ sqr_fft(c, a, N, dft, ift);
         QueryPerformanceCounter(&end);
         
- std::cout << "Iterative FFT: C = "; print(c, 16)(std::cout) << " " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+ std::cout << "Recursive SQR: " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+ S("res_fft_rec_sqr.txt", c, N);
         
         QueryPerformanceCounter(&start);
         for (int i = 0; i < count; ++i)
- mul_fft(c, a, b, fft_slow, ift_slow);
+ sqr_fft(c, a, N, dft_i, ift_i);
         QueryPerformanceCounter(&end);
         
- std::cout << "Slow FT: C = "; print(c, 16)(std::cout) << " " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
+ std::cout << "Iterative SQR: " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+ S("res_fft_iter_sqr.txt", c, N);
+
         QueryPerformanceCounter(&start);
         for (int i = 0; i < count; ++i)
- mul_basecase(c, a, b);
+ mul_basecase(c, a, b, N);
         QueryPerformanceCounter(&end);
         
- std::cout << "Basecase: C = "; print(c, 16)(std::cout) << " " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+ std::cout << "Basecase: " << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
+ S("res_basecase.txt", c, N);
+
+ delete[] a;
+ delete[] b;
+ delete[] c;
 }

Deleted: sandbox/SOC/2007/bigint/fft/main_slow.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/main_slow.cpp 2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
+++ (empty file)
@@ -1,155 +0,0 @@
-#include <iostream>
-
-const int base = 10;
-typedef short limb_t;
-
-struct print
-{
- const limb_t* _data;
- size_t _size;
-
- print(const limb_t* data, size_t size): _data(data), _size(size)
- {
- }
-
- std::ostream& operator()(std::ostream& os) const
- {
- for (const limb_t* i = _data; i != _data + _size; ++i)
- os << *i;
-
- return os;
- }
-};
-
-int pow(int base, int power, int prime)
-{
- int res = 1;
-
- for (int i = 0; i < power; ++i)
- {
- res = (res * base) % prime;
- }
-
- return res;
-}
-
-void fft(unsigned int* dest, const limb_t* source, size_t N, int prime, int root)
-{
- for (int j = 0; j < N; ++j)
- {
- unsigned int r = 0;
-
- for (int k = 0; k < N; ++k)
- {
- r = (r + ((source[k] * pow(root, j*k, prime)) % prime)) % prime;
- }
-
- dest[j] = r;
- }
-}
-
-void ift(unsigned int* dest, const unsigned int* source, size_t N, int prime, int root)
-{
- int rroot = pow(root, prime - 2, prime);
-
- for (int j = 0; j < N; ++j)
- {
- unsigned int r = 0;
-
- for (int k = 0; k < N; ++k)
- {
- r = (r + ((source[k] * pow(rroot, j*k, prime)) % prime)) % prime;
- }
-
- r = (r * pow(N, prime - 2, prime)) % prime;
-
- dest[j] = r;
- }
-}
-
-void mul_fft(limb_t* dest, const limb_t* a, const limb_t* b)
-{
- // p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
- const size_t N = 16;
-
- // N * base^2 == 1600
- // prime0 * prime1 must be > 1600
- const int primes[] = {17, 97};
-
- // primitive root (power N)
- const int roots[] = {3, 8};
-
- unsigned int convs[2][N];
-
- for (int p = 0; p < 2; ++p)
- {
- unsigned int fft_a[N], fft_b[N], fft_c[N];
- fft(fft_a, a, N, primes[p], roots[p]);
- fft(fft_b, b, N, primes[p], roots[p]);
-
- for (size_t i = 0; i < N; ++i)
- {
- fft_c[i] = (fft_a[i] * fft_b[i]) % primes[p];
- }
-
- ift(convs[p], fft_c, N, primes[p], roots[p]);
- }
-
- unsigned int conv[N];
-
- // CRT:
- // x % p1 = r1
- // x % p2 = r2
-
- // x % p1p2 = r1 * (P/p1)*T1 + r2 * (P/p2)*T2
- // T1 = (P/p1) ^ (p1 - 2)
- // T2 = (P/p2) ^ (p2 - 2)
-
- int prime = primes[0] * primes[1];
- int T[2] = {pow(primes[1], primes[0] - 2, primes[0]), pow(primes[0], primes[1] - 2, primes[1])};
-
- for (int i = 0; i < N; ++i)
- {
- conv[i] = (convs[0][i] * T[0] * primes[1] + convs[1][i] * T[1] * primes[0]) % prime;
- }
-
- std::cout << std::endl;
- for (int i = 0; i < N; ++i)
- {
- std::cout << convs[0][i] << " ";
- }
- std::cout << std::endl;
- for (int i = 0; i < N; ++i)
- {
- std::cout << convs[1][i] << " ";
- }
- std::cout << std::endl;
- for (int i = 0; i < N; ++i)
- {
- std::cout << conv[i] << " ";
- }
- std::cout << std::endl;
-
- unsigned int carry = 0;
-
- for (int i = 0; i < N; ++i)
- {
- carry += conv[i];
- dest[i] = carry % 10;
- carry /= 10;
- }
-}
-
-int main()
-{
- limb_t a[16] = {2, 3, 4, 5, 6, 7, 8, 9};
- limb_t b[16] = {1, 2, 3, 4, 5, 6, 7, 8};
-
- limb_t c[16] = {};
-
- mul_fft(c, a, b);
-
- std::cout << "A = "; print(a, 16)(std::cout) << std::endl;
- std::cout << "B = "; print(b, 16)(std::cout) << std::endl;
- std::cout << "C = "; print(c, 16)(std::cout) << std::endl;
-}

Deleted: sandbox/SOC/2007/bigint/fft/main_word.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/main_word.cpp 2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
+++ (empty file)
@@ -1,260 +0,0 @@
-#include <iostream>
-
-#include <windows.h>
-
-#define __int64 long long
-
-const unsigned int base = 65536;
-typedef unsigned short limb_t;
-
-template <typename T> void P(const char* text, const T* data, size_t size)
-{
- std::cout << text;
- for (size_t i = 0; i < size; ++i) std::cout << " " << std::dec << data[i] << std::dec;
- std::cout << std::endl;
-}
-
-unsigned int modmul(unsigned int a, unsigned int b, unsigned int prime)
-{
- return static_cast<unsigned int>(static_cast<unsigned __int64>(a) * b % prime);
-}
-
-unsigned int modadd(unsigned int a, unsigned int b, unsigned int prime)
-{
- return (a + b) % prime;
-}
-
-unsigned int modsub(unsigned int a, unsigned int b, unsigned int prime)
-{
- return (a + prime - b) % prime;
-}
-
-unsigned int pow(unsigned int base, unsigned int power, unsigned int prime)
-{
- unsigned int pot = 1;
-
- // Find largest power of two that is >= rhs
- while (pot < power && (pot << 1) != 0)
- pot <<= 1;
-
- unsigned int res = 1;
-
- while (pot > 0)
- {
- res = modmul(res, res, prime);
-
- if ((power & pot) != 0)
- {
- res = modmul(res, base, prime);
- }
-
- pot >>= 1;
- }
-
- return res;
-}
-
-// reverse(x) -> reverse(x+1)
-size_t rev_next(size_t x, size_t n)
-{
- do
- {
- n >>= 1;
- x ^= n;
- }
- while ((x & n) == 0);
-
- return x;
-}
-
-// swaps values with indices that have reversed bit representation (data[1100b] <-> data[0011b])
-template <typename T> void fft_copy_reorder(unsigned int* dest, const T* source, size_t size)
-{
- if (size <= 2)
- {
- std::copy(source, source + size, dest);
- return;
- }
-
- size_t r = 0;
-
- dest[0] = source[0];
-
- for (size_t x = 1; x < size; ++x)
- {
- r = rev_next(r, size);
-
- dest[x] = source[r];
- }
-}
-
-void fft_recursive(unsigned int* data, size_t size, unsigned int prime, unsigned int root)
-{
- size /= 2;
-
- if (size > 1)
- {
- unsigned int new_root = modmul(root, root, prime);
- fft_recursive(data, size, prime, new_root);
- fft_recursive(data + size, size, prime, new_root);
- }
-
- unsigned int r = 1;
-
- for (size_t i = 0; i < size; ++i)
- {
- unsigned int a = data[i];
- unsigned int b = modmul(data[i + size], r, prime);
-
- data[i] = modadd(a, b, prime);
- data[i + size] = modsub(a, b, prime);
-
- r = modmul(r, root, prime);
- }
-}
-
-void fft(unsigned int* dest, const limb_t* source, size_t N, unsigned int prime, unsigned int root)
-{
- fft_copy_reorder(dest, source, N);
- fft_recursive(dest, N, prime, root);
-}
-
-void ift(unsigned int* dest, const unsigned int* source, size_t N, unsigned int prime, unsigned int root)
-{
- unsigned int rroot = pow(root, prime - 2, prime);
-
- fft_copy_reorder(dest, source, N);
- fft_recursive(dest, N, prime, rroot);
-
- unsigned int N_inv = pow(N, prime - 2, prime);
-
- for (size_t i = 0; i < N; ++i)
- {
- dest[i] = modmul(dest[i], N_inv, prime);
- }
-}
-
-void mul_fft(limb_t* dest, const limb_t* a, const limb_t* b)
-{
- // p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
- const size_t N = 16;
-
- // N * base^2 == 1600
- // prime0 * prime1 must be > 1600
- const unsigned int primes[] = {70254593, 81788929};
-
- // primitive root (power N)
- const unsigned int roots[] = {3183438, 22821826};
-
- unsigned int convs[2][N];
-
- for (int p = 0; p < 2; ++p)
- {
- unsigned int fft_a[N], fft_b[N], fft_c[N];
- fft(fft_a, a, N, primes[p], roots[p]);
- fft(fft_b, b, N, primes[p], roots[p]);
-
- for (size_t i = 0; i < N; ++i)
- {
- fft_c[i] = modmul(fft_a[i], fft_b[i], primes[p]);
- }
-
- ift(convs[p], fft_c, N, primes[p], roots[p]);
- }
-
- unsigned __int64 prime = static_cast<unsigned __int64>(primes[0]) * primes[1];
- unsigned int inv_p0_mod_p1 = pow(primes[0], primes[1] - 2, primes[1]);
-
- unsigned __int64 carry = 0;
-
- for (int i = 0; i < N; ++i)
- {
- // 64 x == 32 c0 mod 32 p0
- // 64 x == 32 c1 mod 32 p1
-
- // x == p0*t + c0
- // p0*t + c0 == c1 mod p1
- // t == (c1 - c0) div p0 mod p1
-
- // t == t0 mod p1
-
- // x == p0 * t0 + c0
- // t0 == (c1 - c0) div p0 mod p1
-
- unsigned int t0 = modmul(modsub(convs[1][i], convs[0][i], primes[1]), inv_p0_mod_p1, primes[1]);
- unsigned __int64 x = static_cast<unsigned __int64>(primes[0]) * t0 + convs[0][i];
-
- carry += x;
-
- dest[i] = carry & 0xffff;
- carry >>= 16;
- }
-}
-
-void mul_basecase(limb_t* dest, const limb_t* a, const limb_t* b)
-{
- typedef unsigned int limb2_t;
-
- const size_t N = 16;
-
- memset(dest, 0, sizeof(limb_t) * N);
-
- limb_t* i = dest;
-
- for (const limb_t* li = a; li != a + N/2; ++li, ++i)
- {
- limb_t carry = 0;
-
- limb_t* ci = i;
-
- for (const limb_t* ri = b; ri != b + N/2; ++ri)
- {
- limb2_t result = static_cast<limb2_t>(*li) * *ri + *ci + carry;
-
- *ci++ = static_cast<limb_t>(result & 0xffff);
-
- carry = static_cast<limb_t>(result >> 16);
- }
-
- while (carry != 0)
- {
- limb2_t result = static_cast<limb2_t>(*ci) + carry;
-
- *ci++ = static_cast<limb_t>(result & 0xffff);
-
- carry = static_cast<limb_t>(result >> 16);
- }
- }
-}
-
-int main()
-{
-// limb_t a[16] = {65535, 65533, 65531, 65529, 65527, 65525, 65523, 65521};
-// limb_t b[16] = {65535, 65534, 65533, 65532, 65531, 65530, 65529, 65528};
- limb_t a[16] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
- limb_t b[16] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-
- limb_t c[16] = {};
-
- P("A = ", a, 8);
- P("B = ", b, 8);
-
- int count = 10000;
- LARGE_INTEGER start, end, freq;
-
- QueryPerformanceFrequency(&freq);
-
- QueryPerformanceCounter(&start);
- for (int i = 0; i < count; ++i)
- mul_fft(c, a, b);
- QueryPerformanceCounter(&end);
-
- P("Fast FT : C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
- QueryPerformanceCounter(&start);
- for (int i = 0; i < count; ++i)
- mul_basecase(c, a, b);
- QueryPerformanceCounter(&end);
-
- P("Basecase: C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-}

Deleted: sandbox/SOC/2007/bigint/fft/main_word_fast.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/main_word_fast.cpp 2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
+++ (empty file)
@@ -1,375 +0,0 @@
-#include <iostream>
-#include <windows.h>
-
-#define __int64 long long
-
-#include "primes.h"
-
-const unsigned int base = 65536;
-typedef unsigned short limb_t;
-
-template <typename T> void P(const char* text, const T* data, size_t size)
-{
- std::cout << text;
- for (size_t i = 0; i < size; ++i) std::cout << " " << std::dec << data[i] << std::dec;
- std::cout << std::endl;
-}
-
-unsigned int modmul(unsigned int a, unsigned int b, unsigned int prime)
-{
- return static_cast<unsigned int>(static_cast<unsigned __int64>(a) * b % prime);
-}
-
-unsigned int modadd(unsigned int a, unsigned int b, unsigned int prime)
-{
- return (a + b) % prime;
-}
-
-unsigned int modsub(unsigned int a, unsigned int b, unsigned int prime)
-{
- return (a + prime - b) % prime;
-}
-
-unsigned int log2(unsigned int value)
-{
- unsigned int r = 0;
-
- while (value > 1)
- {
- ++r;
- value >>= 1;
- }
-
- return r;
-}
-
-// reverse(x) -> reverse(x+1)
-size_t rev_next(size_t x, size_t n)
-{
- do
- {
- n >>= 1;
- x ^= n;
- }
- while ((x & n) == 0);
-
- return x;
-}
-
-// swaps values with indices that have reversed bit representation (data[1100b] <-> data[0011b])
-template <typename T> void fft_copy_reorder(unsigned int* dest, const T* source, size_t size)
-{
- if (size <= 2)
- {
- std::copy(source, source + size, dest);
- return;
- }
-
- size_t r = 0;
-
- dest[0] = source[0];
-
- for (size_t x = 1; x < size; ++x)
- {
- r = rev_next(r, size);
-
- dest[x] = source[r];
- }
-}
-
-void fft_recursive(unsigned int* data, size_t size, unsigned int prime, const unsigned int* roots)
-{
- size /= 2;
-
- if (size > 1)
- {
- fft_recursive(data, size, prime, roots - 1);
- fft_recursive(data + size, size, prime, roots - 1);
- }
-
- unsigned int root = *roots;
- unsigned int r = 1;
-
- for (size_t i = 0; i < size; ++i)
- {
- unsigned int a = data[i];
- unsigned int b = modmul(data[i + size], r, prime);
-
- data[i] = modadd(a, b, prime);
- data[i + size] = modsub(a, b, prime);
-
- r = modmul(r, root, prime);
- }
-}
-
-void fft_iterative(unsigned int* data, size_t size, unsigned int prime, const unsigned int* roots)
-{
- size_t step = 1;
-
- unsigned int nstep = 0;
-
- while (step < size)
- {
- unsigned int root = roots[++nstep];
-
- size_t half_step = step;
- step *= 2;
-
- unsigned int r = 1;
-
- for (size_t j = 0; j < half_step; ++j)
- {
- for (size_t i = j; i < size; i += step)
- {
- unsigned int a = data[i];
- unsigned int b = modmul(data[i + half_step], r, prime);
-
- data[i] = modadd(a, b, prime);
- data[i + half_step] = modsub(a, b, prime);
- }
-
- r = modmul(r, root, prime);
- }
- }
-}
-
-void dft(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table)
-{
- fft_copy_reorder(dest, source, N);
- fft_recursive(dest, N, prime, root_table + log2N);
-}
-
-void dft_i(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table)
-{
- fft_copy_reorder(dest, source, N);
- fft_iterative(dest, N, prime, root_table);
-}
-
-void ift(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
-{
- fft_copy_reorder(dest, source, N);
- fft_recursive(dest, N, prime, root_table + log2N);
-
- for (size_t i = 0; i < N; ++i)
- {
- dest[i] = modmul(dest[i], inv_N, prime);
- }
-}
-
-void ift_i(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
-{
- fft_copy_reorder(dest, source, N);
- fft_iterative(dest, N, prime, root_table);
-
- for (size_t i = 0; i < N; ++i)
- {
- dest[i] = modmul(dest[i], inv_N, prime);
- }
-}
-
-void fft_crt_carry(limb_t* dest, const unsigned int* conv0, const unsigned int* conv1, size_t N)
-{
- unsigned __int64 prime = static_cast<unsigned __int64>(fft_primes[0]) * fft_primes[1];
- unsigned int inv_p0_mod_p1 = fft_inv_p0_mod_p1;
-
- unsigned __int64 carry = 0;
-
- for (int i = 0; i < N; ++i)
- {
- // 64 x == 32 c0 mod 32 p0
- // 64 x == 32 c1 mod 32 p1
-
- // x == p0*t + c0
- // p0*t + c0 == c1 mod p1
- // t == (c1 - c0) div p0 mod p1
-
- // t == t0 mod p1
-
- // x == p0 * t0 + c0
- // t0 == (c1 - c0) div p0 mod p1
-
- unsigned int t0 = modmul(modsub(conv1[i], conv0[i], fft_primes[1]), inv_p0_mod_p1, fft_primes[1]);
- unsigned __int64 x = static_cast<unsigned __int64>(fft_primes[0]) * t0 + conv0[i];
-
- carry += x;
-
- dest[i] = carry & 0xffff;
- carry >>= 16;
- }
-}
-
-void mul_fft(limb_t* dest, const limb_t* a, const limb_t* b,
- void (*dft)(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table),
- void (*ift)(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
- )
-{
- // p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
- const size_t N = 16;
-
- unsigned int log2N = log2(N);
-
- unsigned int convs[2][N];
-
- for (int p = 0; p < 2; ++p)
- {
- const unsigned int* root_table = fft_primitive_roots[p];
- unsigned int prime = fft_primes[p];
-
- unsigned int fft_a[N], fft_b[N], fft_c[N];
- dft(fft_a, a, N, log2N, prime, root_table);
- dft(fft_b, b, N, log2N, prime, root_table);
-
- for (size_t i = 0; i < N; ++i)
- {
- fft_c[i] = modmul(fft_a[i], fft_b[i], prime);
- }
-
- const unsigned int* inv_root_table = fft_inv_primitive_roots[p];
- unsigned int inv_N = fft_inv_N[p][log2N];
-
- ift(convs[p], fft_c, N, log2N, prime, inv_root_table, inv_N);
- }
-
- fft_crt_carry(dest, convs[0], convs[1], N);
-}
-
-void sqr_fft(limb_t* dest, const limb_t* a,
- void (*dft)(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table),
- void (*ift)(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, const unsigned int* root_table, unsigned int inv_N)
- )
-{
- // p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
- const size_t N = 16;
-
- unsigned int log2N = log2(N);
-
- unsigned int convs[2][N];
-
- for (int p = 0; p < 2; ++p)
- {
- const unsigned int* root_table = fft_primitive_roots[p];
- unsigned int prime = fft_primes[p];
-
- unsigned int fft_a[N], fft_c[N];
- dft(fft_a, a, N, log2N, prime, root_table);
-
- for (size_t i = 0; i < N; ++i)
- {
- fft_c[i] = modmul(fft_a[i], fft_a[i], prime);
- }
-
- const unsigned int* inv_root_table = fft_inv_primitive_roots[p];
- unsigned int inv_N = fft_inv_N[p][log2N];
-
- ift(convs[p], fft_c, N, log2N, prime, inv_root_table, inv_N);
- }
-
- fft_crt_carry(dest, convs[0], convs[1], N);
-}
-
-void mul_basecase(limb_t* dest, const limb_t* a, const limb_t* b)
-{
- typedef unsigned int limb2_t;
-
- const size_t N = 16;
-
- memset(dest, 0, sizeof(limb_t) * N);
-
- limb_t* i = dest;
-
- for (const limb_t* li = a; li != a + N/2; ++li, ++i)
- {
- limb_t carry = 0;
-
- limb_t* ci = i;
-
- for (const limb_t* ri = b; ri != b + N/2; ++ri)
- {
- limb2_t result = static_cast<limb2_t>(*li) * *ri + *ci + carry;
-
- *ci++ = static_cast<limb_t>(result & 0xffff);
-
- carry = static_cast<limb_t>(result >> 16);
- }
-
- while (carry != 0)
- {
- limb2_t result = static_cast<limb2_t>(*ci) + carry;
-
- *ci++ = static_cast<limb_t>(result & 0xffff);
-
- carry = static_cast<limb_t>(result >> 16);
- }
- }
-}
-
-int main()
-{
-// limb_t a[16] = {65535, 65533, 65531, 65529, 65527, 65525, 65523, 65521};
-// limb_t b[16] = {65535, 65534, 65533, 65532, 65531, 65530, 65529, 65528};
- limb_t a[16] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
- limb_t b[16] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-/*
- limb_t a[256] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-
- limb_t b[256] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-*/
- limb_t c[16] = {};
-
- P("A = ", a, 8);
- P("B = ", b, 8);
-
- int count = 10000;
- LARGE_INTEGER start, end, freq;
-
- QueryPerformanceFrequency(&freq);
-
- QueryPerformanceCounter(&start);
- for (int i = 0; i < count; ++i)
- mul_fft(c, a, b, dft, ift);
- QueryPerformanceCounter(&end);
-
- P("Recursive: C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
- QueryPerformanceCounter(&start);
- for (int i = 0; i < count; ++i)
- mul_fft(c, a, b, dft_i, ift_i);
- QueryPerformanceCounter(&end);
-
- P("Iterative: C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
- QueryPerformanceCounter(&start);
- for (int i = 0; i < count; ++i)
- sqr_fft(c, a, dft, ift);
- QueryPerformanceCounter(&end);
-
- P("Recursive SQR: C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
- QueryPerformanceCounter(&start);
- for (int i = 0; i < count; ++i)
- sqr_fft(c, a, dft_i, ift_i);
- QueryPerformanceCounter(&end);
-
- P("Iterative SQR: C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
- QueryPerformanceCounter(&start);
- for (int i = 0; i < count; ++i)
- mul_basecase(c, a, b);
- QueryPerformanceCounter(&end);
-
- P("Basecase : C = ", c, 16); std::cout << (float)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-}

Deleted: sandbox/SOC/2007/bigint/fft/main_word_weird.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/main_word_weird.cpp 2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
+++ (empty file)
@@ -1,382 +0,0 @@
-#include <iostream>
-#include <windows.h>
-
-#define __int64 long long
-
-#include "primes.h"
-
-const unsigned int base = 65536;
-typedef unsigned short limb_t;
-
-template <typename T> void P(const char* text, const T* data, size_t size)
-{
- std::cout << text;
- for (size_t i = 0; i < size; ++i) std::cout << " " << std::dec << data[i] << std::dec;
- std::cout << std::endl;
-}
-
-static const double fft_primes_d[] = {1.0 / 70254593.0, 1.0 / 81788929.0};
-
-unsigned int modmul(unsigned int a, unsigned int b, unsigned int prime, double prime_d)
-{
- unsigned int r = a * b;
- r = r - prime * (unsigned int)(prime_d * (double) a * (double) b);
-
- return (r >= prime ? r - prime : r);
-}
-
-unsigned int modadd(unsigned int a, unsigned int b, unsigned int prime)
-{
- return (a + b) % prime;
-}
-
-unsigned int modsub(unsigned int a, unsigned int b, unsigned int prime)
-{
- return (a + prime - b) % prime;
-}
-
-unsigned int log2(unsigned int value)
-{
- unsigned int r = 0;
-
- while (value > 1)
- {
- ++r;
- value >>= 1;
- }
-
- return r;
-}
-
-// reverse(x) -> reverse(x+1)
-size_t rev_next(size_t x, size_t n)
-{
- do
- {
- n >>= 1;
- x ^= n;
- }
- while ((x & n) == 0);
-
- return x;
-}
-
-// swaps values with indices that have reversed bit representation (data[1100b] <-> data[0011b])
-template <typename T> void fft_copy_reorder(unsigned int* dest, const T* source, size_t size)
-{
- if (size <= 2)
- {
- std::copy(source, source + size, dest);
- return;
- }
-
- size_t r = 0;
-
- dest[0] = source[0];
-
- for (size_t x = 1; x < size; ++x)
- {
- r = rev_next(r, size);
-
- dest[x] = source[r];
- }
-}
-
-void fft_recursive(unsigned int* data, size_t size, unsigned int prime, double prime_d, const unsigned int* roots)
-{
- size /= 2;
-
- if (size > 1)
- {
- fft_recursive(data, size, prime, prime_d, roots - 1);
- fft_recursive(data + size, size, prime, prime_d, roots - 1);
- }
-
- unsigned int root = *roots;
- unsigned int r = 1;
-
- for (size_t i = 0; i < size; ++i)
- {
- unsigned int a = data[i];
- unsigned int b = modmul(data[i + size], r, prime, prime_d);
-
- data[i] = modadd(a, b, prime);
- data[i + size] = modsub(a, b, prime);
-
- r = modmul(r, root, prime, prime_d);
- }
-}
-
-void fft_iterative(unsigned int* data, size_t size, unsigned int prime, double prime_d, const unsigned int* roots)
-{
- size_t step = 1;
-
- unsigned int nstep = 0;
-
- while (step < size)
- {
- unsigned int root = roots[++nstep];
-
- size_t half_step = step;
- step *= 2;
-
- unsigned int r = 1;
-
- for (size_t j = 0; j < half_step; ++j)
- {
- for (size_t i = j; i < size; i += step)
- {
- unsigned int a = data[i];
- unsigned int b = modmul(data[i + half_step], r, prime, prime_d);
-
- data[i] = modadd(a, b, prime);
- data[i + half_step] = modsub(a, b, prime);
- }
-
- r = modmul(r, root, prime, prime_d);
- }
- }
-}
-
-void dft(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table)
-{
- fft_copy_reorder(dest, source, N);
- fft_recursive(dest, N, prime, prime_d, root_table + log2N);
-}
-
-void dft_i(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table)
-{
- fft_copy_reorder(dest, source, N);
- fft_iterative(dest, N, prime, prime_d, root_table);
-}
-
-void ift(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table, unsigned int inv_N)
-{
- fft_copy_reorder(dest, source, N);
- fft_recursive(dest, N, prime, prime_d, root_table + log2N);
-
- for (size_t i = 0; i < N; ++i)
- {
- dest[i] = modmul(dest[i], inv_N, prime, prime_d);
- }
-}
-
-void ift_i(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table, unsigned int inv_N)
-{
- fft_copy_reorder(dest, source, N);
- fft_iterative(dest, N, prime, prime_d, root_table);
-
- for (size_t i = 0; i < N; ++i)
- {
- dest[i] = modmul(dest[i], inv_N, prime, prime_d);
- }
-}
-
-void fft_crt_carry(limb_t* dest, const unsigned int* conv0, const unsigned int* conv1, size_t N)
-{
- unsigned __int64 prime = static_cast<unsigned __int64>(fft_primes[0]) * fft_primes[1];
- unsigned int inv_p0_mod_p1 = fft_inv_p0_mod_p1;
-
- unsigned __int64 carry = 0;
-
- for (int i = 0; i < N; ++i)
- {
- // 64 x == 32 c0 mod 32 p0
- // 64 x == 32 c1 mod 32 p1
-
- // x == p0*t + c0
- // p0*t + c0 == c1 mod p1
- // t == (c1 - c0) div p0 mod p1
-
- // t == t0 mod p1
-
- // x == p0 * t0 + c0
- // t0 == (c1 - c0) div p0 mod p1
-
- unsigned int t0 = modmul(modsub(conv1[i], conv0[i], fft_primes[1]), inv_p0_mod_p1, fft_primes[1], fft_primes_d[1]);
- unsigned __int64 x = static_cast<unsigned __int64>(fft_primes[0]) * t0 + conv0[i];
-
- carry += x;
-
- dest[i] = carry & 0xffff;
- carry >>= 16;
- }
-}
-
-void mul_fft(limb_t* dest, const limb_t* a, const limb_t* b,
- void (*dft)(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table),
- void (*ift)(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table, unsigned int inv_N)
- )
-{
- // p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
- const size_t N = 16;
-
- unsigned int log2N = log2(N);
-
- unsigned int convs[2][N];
-
- for (int p = 0; p < 2; ++p)
- {
- const unsigned int* root_table = fft_primitive_roots[p];
- unsigned int prime = fft_primes[p];
- double prime_d = fft_primes_d[p];
-
- unsigned int fft_a[N], fft_b[N], fft_c[N];
- dft(fft_a, a, N, log2N, prime, prime_d, root_table);
- dft(fft_b, b, N, log2N, prime, prime_d, root_table);
-
- for (size_t i = 0; i < N; ++i)
- {
- fft_c[i] = modmul(fft_a[i], fft_b[i], prime, prime_d);
- }
-
- const unsigned int* inv_root_table = fft_inv_primitive_roots[p];
- unsigned int inv_N = fft_inv_N[p][log2N];
-
- ift(convs[p], fft_c, N, log2N, prime, prime_d, inv_root_table, inv_N);
- }
-
- fft_crt_carry(dest, convs[0], convs[1], N);
-}
-
-void sqr_fft(limb_t* dest, const limb_t* a,
- void (*dft)(unsigned int* dest, const limb_t* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table),
- void (*ift)(unsigned int* dest, const unsigned int* source, size_t N, unsigned int log2N, unsigned int prime, double prime_d, const unsigned int* root_table, unsigned int inv_N)
- )
-{
- // p must be 1. prime, 2. of the form k*N+1, where N is higherPOT(length(a) + length(b)), and >= N*base^2
- const size_t N = 16;
-
- unsigned int log2N = log2(N);
-
- unsigned int convs[2][N];
-
- for (int p = 0; p < 2; ++p)
- {
- const unsigned int* root_table = fft_primitive_roots[p];
- unsigned int prime = fft_primes[p];
- double prime_d = fft_primes_d[p];
-
- unsigned int fft_a[N], fft_c[N];
- dft(fft_a, a, N, log2N, prime, prime_d, root_table);
-
- for (size_t i = 0; i < N; ++i)
- {
- fft_c[i] = modmul(fft_a[i], fft_a[i], prime, prime_d);
- }
-
- const unsigned int* inv_root_table = fft_inv_primitive_roots[p];
- unsigned int inv_N = fft_inv_N[p][log2N];
-
- ift(convs[p], fft_c, N, log2N, prime, prime_d, inv_root_table, inv_N);
- }
-
- fft_crt_carry(dest, convs[0], convs[1], N);
-}
-
-void mul_basecase(limb_t* dest, const limb_t* a, const limb_t* b)
-{
- typedef unsigned int limb2_t;
-
- const size_t N = 16;
-
- memset(dest, 0, sizeof(limb_t) * N);
-
- limb_t* i = dest;
-
- for (const limb_t* li = a; li != a + N/2; ++li, ++i)
- {
- limb_t carry = 0;
-
- limb_t* ci = i;
-
- for (const limb_t* ri = b; ri != b + N/2; ++ri)
- {
- limb2_t result = static_cast<limb2_t>(*li) * *ri + *ci + carry;
-
- *ci++ = static_cast<limb_t>(result & 0xffff);
-
- carry = static_cast<limb_t>(result >> 16);
- }
-
- while (carry != 0)
- {
- limb2_t result = static_cast<limb2_t>(*ci) + carry;
-
- *ci++ = static_cast<limb_t>(result & 0xffff);
-
- carry = static_cast<limb_t>(result >> 16);
- }
- }
-}
-
-int main()
-{
-// limb_t a[16] = {65535, 65533, 65531, 65529, 65527, 65525, 65523, 65521};
-// limb_t b[16] = {65535, 65534, 65533, 65532, 65531, 65530, 65529, 65528};
- limb_t a[16] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
- limb_t b[16] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-/*
- limb_t a[256] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-
- limb_t b[256] = {65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535,
- 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535};
-*/
- limb_t c[16] = {};
-
- P("A = ", a, 8);
- P("B = ", b, 8);
-
- int count = 100000;
- LARGE_INTEGER start, end, freq;
-
- QueryPerformanceFrequency(&freq);
-
- QueryPerformanceCounter(&start);
- for (int i = 0; i < count; ++i)
- mul_fft(c, a, b, dft, ift);
- QueryPerformanceCounter(&end);
-
- P("Recursive: C = ", c, 16); std::cout << (double)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
- QueryPerformanceCounter(&start);
- for (int i = 0; i < count; ++i)
- mul_fft(c, a, b, dft_i, ift_i);
- QueryPerformanceCounter(&end);
-
- P("Iterative: C = ", c, 16); std::cout << (double)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
- QueryPerformanceCounter(&start);
- for (int i = 0; i < count; ++i)
- sqr_fft(c, a, dft, ift);
- QueryPerformanceCounter(&end);
-
- P("Recursive SQR: C = ", c, 16); std::cout << (double)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
- QueryPerformanceCounter(&start);
- for (int i = 0; i < count; ++i)
- sqr_fft(c, a, dft_i, ift_i);
- QueryPerformanceCounter(&end);
-
- P("Iterative SQR: C = ", c, 16); std::cout << (double)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-
- QueryPerformanceCounter(&start);
- for (int i = 0; i < count; ++i)
- mul_basecase(c, a, b);
- QueryPerformanceCounter(&end);
-
- P("Basecase : C = ", c, 16); std::cout << (double)(end.QuadPart - start.QuadPart) / freq.QuadPart << std::endl;
-}

Added: sandbox/SOC/2007/bigint/fft/primes.h
==============================================================================
--- (empty file)
+++ sandbox/SOC/2007/bigint/fft/primes.h 2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
@@ -0,0 +1,27 @@
+static const unsigned int fft_primes[] = {70254593, 81788929};
+
+// 70254593 = 67 * 2^20 + 1
+// 81788929 = 39 * 2^21 + 1
+// fft_max_N * base^2 should be less than 70254593 * 81788929 = 5746047918800897 (base = 65536)
+// fft_max_N should be greater or equal than 2^20 = 1048576 due to primes structure
+const unsigned int fft_max_N = 1048576;
+
+static const unsigned int fft_primitive_roots[2][21] =
+{
+ {1, 70254592, 50348549, 19383190, 12538576, 54567172, 35010243, 946420, 69401582, 12802663, 39325177, 61326249, 20368503, 32379324, 58783494, 58098554, 49169846, 5764801, 2401, 49, 7},
+ {1, 81788928, 23980934, 1883838, 32049591, 79935288, 22685020, 73127609, 43778833, 17637033, 63153960, 14279076, 36624597, 52261024, 54282709, 36693892, 78305599, 42953381, 41608963, 418609, 647}
+};
+
+static const unsigned int fft_inv_primitive_roots[2][21] =
+{
+ {1, 70254592, 19906044, 52459594, 3183438, 53518501, 39073710, 19737011, 63685585, 64302287, 30531912, 69788710, 18847954, 64069532, 36127269, 61117601, 3300726, 63813627, 61505687, 63085757, 20072741},
+ {1, 81788928, 57807995, 1977387, 47596280, 37757181, 33726541, 75044061, 38800727, 16012482, 79766222, 64674754, 434079, 26744811, 67309642, 61891499, 67784436, 73579077, 56837587, 80643596, 76858839}
+};
+
+static const unsigned int fft_inv_N[2][21] =
+{
+ {1, 35127297, 52690945, 61472769, 65863681, 68059137, 69156865, 69705729, 69980161, 70117377, 70185985, 70220289, 70237441, 70246017, 70250305, 70252449, 70253521, 70254057, 70254325, 70254459, 70254526},
+ {1, 40894465, 61341697, 71565313, 76677121, 79233025, 80510977, 81149953, 81469441, 81629185, 81709057, 81748993, 81768961, 81778945, 81783937, 81786433, 81787681, 81788305, 81788617, 81788773, 81788851}
+};
+
+const unsigned int fft_inv_p0_mod_p1 = 37176793;

Modified: sandbox/SOC/2007/bigint/fft/roots.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/roots.cpp (original)
+++ sandbox/SOC/2007/bigint/fft/roots.cpp 2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
@@ -75,11 +75,40 @@
                 powers[i] = power;
         }
 
- std::cout << std::endl;
+ // convolution's element is N * base^2 at max, where N is size of each vector (provided we do N * N -> 2N multiplication)
+ unsigned __int64 modulus = static_cast<unsigned __int64>(primes[0]) * primes[1];
+
+ modulus /= base;
+ modulus /= base;
+
+ // now modulus is upper bound for N; round it down to 2^k
+
+ unsigned int maxN = 1;
+ while (maxN * 2 <= modulus) maxN *= 2;
+
+ std::cout << "// fft_max_N * base^2 should be less than " << primes[0] << " * " << primes[1] << " = " << static_cast<unsigned __int64>(primes[0]) * primes[1];
+ std::cout << " (base = " << base << ")\n";
 
         unsigned int power_base = std::min(powers[0], powers[1]);
         unsigned int power = 1 << power_base;
 
+ std::cout << "// fft_max_N should be greater or equal than 2^" << power_base << " = " << power << " due to primes structure" << std::endl;
+
+ if (power > maxN)
+ {
+ while (power > maxN)
+ {
+ --power_base;
+ power /= 2;
+ }
+ }
+ else
+ {
+ maxN = power;
+ }
+
+ std::cout << "const unsigned int fft_max_N = " << maxN << ";\n\n";
+
         unsigned int roots[2];
 
         // find primitive roots (for power)

Deleted: sandbox/SOC/2007/bigint/fft/sieve.cpp
==============================================================================
--- sandbox/SOC/2007/bigint/fft/sieve.cpp 2007-08-02 17:02:48 EDT (Thu, 02 Aug 2007)
+++ (empty file)
@@ -1,42 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <algorithm>
-
-int main()
-{
- const size_t size = 100000000;
-
- bool* sieve = new bool[size];
-
- std::fill(sieve, sieve + size, true);
-
- sieve[0] = false;
- sieve[1] = false;
-
- for (size_t i = 2; i < size; ++i)
- {
- if (i % (size / 10000) == 0) {
- std::cout << (float)(i / (size / 10000)) / 100 << " %" << std::endl;
- }
-
- if (sieve[i])
- {
- for (size_t k = i + i; k < size; k += i)
- {
- sieve[k] = false;
- }
- }
- }
-
- std::ofstream out("primes.txt");
-
- for (size_t i = 0; i < size; ++i)
- {
- if (sieve[i] && i % 1048576 == 1)
- {
- out << i << " ";
- }
- }
-
- delete[] sieve;
-}


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk