Boost logo

Boost-Commit :

From: john_at_[hidden]
Date: 2008-06-10 04:12:18


Author: johnmaddock
Date: 2008-06-10 04:12:17 EDT (Tue, 10 Jun 2008)
New Revision: 46289
URL: http://svn.boost.org/trac/boost/changeset/46289

Log:
Added SSE2 optimised Lanczos approximations.
Reordered the tgamma function to reduce the number of comparisons needed.
Added:
   sandbox/math_toolkit/boost/math/special_functions/detail/lanczos_sse2.hpp (contents, props changed)
Text files modified:
   sandbox/math_toolkit/boost/math/special_functions/gamma.hpp | 39 +++++++++++++++++++++------------------
   sandbox/math_toolkit/boost/math/special_functions/lanczos.hpp | 4 ++++
   2 files changed, 25 insertions(+), 18 deletions(-)

Added: sandbox/math_toolkit/boost/math/special_functions/detail/lanczos_sse2.hpp
==============================================================================
--- (empty file)
+++ sandbox/math_toolkit/boost/math/special_functions/detail/lanczos_sse2.hpp 2008-06-10 04:12:17 EDT (Tue, 10 Jun 2008)
@@ -0,0 +1,201 @@
+// (C) Copyright John Maddock 2006.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
+#define BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
+
+#ifdef _MSC_VER
+#pragma once
+#endif
+
+#include <emmintrin.h>
+
+#ifdef __GNUC__
+#define ALIGN16 __attribute__((aligned(16)))
+#else
+#define ALIGN16 __declspec(align(16))
+#endif
+
+namespace boost{ namespace math{ namespace lanczos{
+
+template <>
+inline double lanczos13m53::lanczos_sum<double>(const double& x)
+{
+ static const ALIGN16 double coeff[26] = {
+ static_cast<double>(2.506628274631000270164908177133837338626L),
+ static_cast<double>(1u),
+ static_cast<double>(210.8242777515793458725097339207133627117L),
+ static_cast<double>(66u),
+ static_cast<double>(8071.672002365816210638002902272250613822L),
+ static_cast<double>(1925u),
+ static_cast<double>(186056.2653952234950402949897160456992822L),
+ static_cast<double>(32670u),
+ static_cast<double>(2876370.628935372441225409051620849613599L),
+ static_cast<double>(357423u),
+ static_cast<double>(31426415.58540019438061423162831820536287L),
+ static_cast<double>(2637558u),
+ static_cast<double>(248874557.8620541565114603864132294232163L),
+ static_cast<double>(13339535u),
+ static_cast<double>(1439720407.311721673663223072794912393972L),
+ static_cast<double>(45995730u),
+ static_cast<double>(6039542586.35202800506429164430729792107L),
+ static_cast<double>(105258076u),
+ static_cast<double>(17921034426.03720969991975575445893111267L),
+ static_cast<double>(150917976u),
+ static_cast<double>(35711959237.35566804944018545154716670596L),
+ static_cast<double>(120543840u),
+ static_cast<double>(42919803642.64909876895789904700198885093L),
+ static_cast<double>(39916800u),
+ static_cast<double>(23531376880.41075968857200767445163675473L),
+ static_cast<double>(0u)
+ };
+ register __m128d vx = _mm_load1_pd(&x);
+ register __m128d sum_even = _mm_load_pd(coeff);
+ register __m128d sum_odd = _mm_load_pd(coeff+2);
+ register __m128d nc_odd, nc_even;
+ register __m128d vx2 = _mm_mul_pd(vx, vx);
+
+ sum_even = _mm_mul_pd(sum_even, vx2);
+ nc_even = _mm_load_pd(coeff + 4);
+ sum_odd = _mm_mul_pd(sum_odd, vx2);
+ nc_odd = _mm_load_pd(coeff + 6);
+ sum_even = _mm_add_pd(sum_even, nc_even);
+ sum_odd = _mm_add_pd(sum_odd, nc_odd);
+
+ sum_even = _mm_mul_pd(sum_even, vx2);
+ nc_even = _mm_load_pd(coeff + 8);
+ sum_odd = _mm_mul_pd(sum_odd, vx2);
+ nc_odd = _mm_load_pd(coeff + 10);
+ sum_even = _mm_add_pd(sum_even, nc_even);
+ sum_odd = _mm_add_pd(sum_odd, nc_odd);
+
+ sum_even = _mm_mul_pd(sum_even, vx2);
+ nc_even = _mm_load_pd(coeff + 12);
+ sum_odd = _mm_mul_pd(sum_odd, vx2);
+ nc_odd = _mm_load_pd(coeff + 14);
+ sum_even = _mm_add_pd(sum_even, nc_even);
+ sum_odd = _mm_add_pd(sum_odd, nc_odd);
+
+ sum_even = _mm_mul_pd(sum_even, vx2);
+ nc_even = _mm_load_pd(coeff + 16);
+ sum_odd = _mm_mul_pd(sum_odd, vx2);
+ nc_odd = _mm_load_pd(coeff + 18);
+ sum_even = _mm_add_pd(sum_even, nc_even);
+ sum_odd = _mm_add_pd(sum_odd, nc_odd);
+
+ sum_even = _mm_mul_pd(sum_even, vx2);
+ nc_even = _mm_load_pd(coeff + 20);
+ sum_odd = _mm_mul_pd(sum_odd, vx2);
+ nc_odd = _mm_load_pd(coeff + 22);
+ sum_even = _mm_add_pd(sum_even, nc_even);
+ sum_odd = _mm_add_pd(sum_odd, nc_odd);
+
+ sum_even = _mm_mul_pd(sum_even, vx2);
+ nc_even = _mm_load_pd(coeff + 24);
+ sum_odd = _mm_mul_pd(sum_odd, vx);
+ sum_even = _mm_add_pd(sum_even, nc_even);
+ sum_even = _mm_add_pd(sum_even, sum_odd);
+
+
+ double ALIGN16 t[2];
+ _mm_store_pd(t, sum_even);
+
+ return t[0] / t[1];
+}
+
+template <>
+inline double lanczos13m53::lanczos_sum_expG_scaled<double>(const double& x)
+{
+ static const ALIGN16 double coeff[26] = {
+ static_cast<double>(0.006061842346248906525783753964555936883222L),
+ static_cast<double>(1u),
+ static_cast<double>(0.5098416655656676188125178644804694509993L),
+ static_cast<double>(66u),
+ static_cast<double>(19.51992788247617482847860966235652136208L),
+ static_cast<double>(1925u),
+ static_cast<double>(449.9445569063168119446858607650988409623L),
+ static_cast<double>(32670u),
+ static_cast<double>(6955.999602515376140356310115515198987526L),
+ static_cast<double>(357423u),
+ static_cast<double>(75999.29304014542649875303443598909137092L),
+ static_cast<double>(2637558u),
+ static_cast<double>(601859.6171681098786670226533699352302507L),
+ static_cast<double>(13339535u),
+ static_cast<double>(3481712.15498064590882071018964774556468L),
+ static_cast<double>(45995730u),
+ static_cast<double>(14605578.08768506808414169982791359218571L),
+ static_cast<double>(105258076u),
+ static_cast<double>(43338889.32467613834773723740590533316085L),
+ static_cast<double>(150917976u),
+ static_cast<double>(86363131.28813859145546927288977868422342L),
+ static_cast<double>(120543840u),
+ static_cast<double>(103794043.1163445451906271053616070238554L),
+ static_cast<double>(39916800u),
+ static_cast<double>(56906521.91347156388090791033559122686859L),
+ static_cast<double>(0u)
+ };
+ register __m128d vx = _mm_load1_pd(&x);
+ register __m128d sum_even = _mm_load_pd(coeff);
+ register __m128d sum_odd = _mm_load_pd(coeff+2);
+ register __m128d nc_odd, nc_even;
+ register __m128d vx2 = _mm_mul_pd(vx, vx);
+
+ sum_even = _mm_mul_pd(sum_even, vx2);
+ nc_even = _mm_load_pd(coeff + 4);
+ sum_odd = _mm_mul_pd(sum_odd, vx2);
+ nc_odd = _mm_load_pd(coeff + 6);
+ sum_even = _mm_add_pd(sum_even, nc_even);
+ sum_odd = _mm_add_pd(sum_odd, nc_odd);
+
+ sum_even = _mm_mul_pd(sum_even, vx2);
+ nc_even = _mm_load_pd(coeff + 8);
+ sum_odd = _mm_mul_pd(sum_odd, vx2);
+ nc_odd = _mm_load_pd(coeff + 10);
+ sum_even = _mm_add_pd(sum_even, nc_even);
+ sum_odd = _mm_add_pd(sum_odd, nc_odd);
+
+ sum_even = _mm_mul_pd(sum_even, vx2);
+ nc_even = _mm_load_pd(coeff + 12);
+ sum_odd = _mm_mul_pd(sum_odd, vx2);
+ nc_odd = _mm_load_pd(coeff + 14);
+ sum_even = _mm_add_pd(sum_even, nc_even);
+ sum_odd = _mm_add_pd(sum_odd, nc_odd);
+
+ sum_even = _mm_mul_pd(sum_even, vx2);
+ nc_even = _mm_load_pd(coeff + 16);
+ sum_odd = _mm_mul_pd(sum_odd, vx2);
+ nc_odd = _mm_load_pd(coeff + 18);
+ sum_even = _mm_add_pd(sum_even, nc_even);
+ sum_odd = _mm_add_pd(sum_odd, nc_odd);
+
+ sum_even = _mm_mul_pd(sum_even, vx2);
+ nc_even = _mm_load_pd(coeff + 20);
+ sum_odd = _mm_mul_pd(sum_odd, vx2);
+ nc_odd = _mm_load_pd(coeff + 22);
+ sum_even = _mm_add_pd(sum_even, nc_even);
+ sum_odd = _mm_add_pd(sum_odd, nc_odd);
+
+ sum_even = _mm_mul_pd(sum_even, vx2);
+ nc_even = _mm_load_pd(coeff + 24);
+ sum_odd = _mm_mul_pd(sum_odd, vx);
+ sum_even = _mm_add_pd(sum_even, nc_even);
+ sum_even = _mm_add_pd(sum_even, sum_odd);
+
+
+ double ALIGN16 t[2];
+ _mm_store_pd(t, sum_even);
+
+ return t[0] / t[1];
+}
+
+} // namespace lanczos
+} // namespace math
+} // namespace boost
+
+#endif // BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS
+
+
+
+

Modified: sandbox/math_toolkit/boost/math/special_functions/gamma.hpp
==============================================================================
--- sandbox/math_toolkit/boost/math/special_functions/gamma.hpp (original)
+++ sandbox/math_toolkit/boost/math/special_functions/gamma.hpp 2008-06-10 04:12:17 EDT (Tue, 10 Jun 2008)
@@ -142,26 +142,29 @@
 #endif
    static const char* function = "boost::math::tgamma<%1%>(%1%)";
 
- if((z <= 0) && (floor(z) == z))
- return policies::raise_pole_error<T>(function, "Evaluation of tgamma at a negative integer %1%.", z, pol);
- if(z <= -20)
+ if(z <= 0)
    {
- result = gamma_imp(-z, pol, l) * sinpx(z);
- if((fabs(result) < 1) && (tools::max_value<T>() * fabs(result) < boost::math::constants::pi<T>()))
- return policies::raise_overflow_error<T>(function, "Result of tgamma is too large to represent.", pol);
- result = -boost::math::constants::pi<T>() / result;
- if(result == 0)
- return policies::raise_underflow_error<T>(function, "Result of tgamma is too small to represent.", pol);
- if((boost::math::fpclassify)(result) == (int)FP_SUBNORMAL)
- return policies::raise_denorm_error<T>(function, "Result of tgamma is denormalized.", result, pol);
- return result;
- }
+ if(floor(z) == z)
+ return policies::raise_pole_error<T>(function, "Evaluation of tgamma at a negative integer %1%.", z, pol);
+ if(z <= -20)
+ {
+ result = gamma_imp(-z, pol, l) * sinpx(z);
+ if((fabs(result) < 1) && (tools::max_value<T>() * fabs(result) < boost::math::constants::pi<T>()))
+ return policies::raise_overflow_error<T>(function, "Result of tgamma is too large to represent.", pol);
+ result = -boost::math::constants::pi<T>() / result;
+ if(result == 0)
+ return policies::raise_underflow_error<T>(function, "Result of tgamma is too small to represent.", pol);
+ if((boost::math::fpclassify)(result) == (int)FP_SUBNORMAL)
+ return policies::raise_denorm_error<T>(function, "Result of tgamma is denormalized.", result, pol);
+ return result;
+ }
 
- // shift z to > 1:
- while(z < 1)
- {
- result /= z;
- z += 1;
+ // shift z to > 1:
+ while(z < 0)
+ {
+ result /= z;
+ z += 1;
+ }
    }
    if((floor(z) == z) && (z < max_factorial<T>::value))
    {

Modified: sandbox/math_toolkit/boost/math/special_functions/lanczos.hpp
==============================================================================
--- sandbox/math_toolkit/boost/math/special_functions/lanczos.hpp (original)
+++ sandbox/math_toolkit/boost/math/special_functions/lanczos.hpp 2008-06-10 04:12:17 EDT (Tue, 10 Jun 2008)
@@ -1229,6 +1229,10 @@
 } // namespace math
 } // namespace boost
 
+#if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(__SSE2__)
+#include <boost/math/special_functions/detail/lanczos_sse2.hpp>
+#endif
+
 #endif // BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS
 
 


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk