#include #include extern "C" { # include } #include #include #include #include #include #include #include #include #include #include #include namespace boost { namespace numeric { namespace ublas { template BOOST_UBLAS_INLINE M block_prod_goto (const matrix_expression &e1, const matrix_expression &e2, row_major_tag) { typedef M matrix_type; const std::size_t i_block_size = IBS; const std::size_t j_block_size = JBS; const std::size_t k_block_size = KBS; typedef const E1 expression1_type; typedef const E2 expression2_type; typedef typename M::size_type size_type; typedef typename M::value_type value_type; M m (e1 ().size1 (), e2 ().size2 ()); size_type i_size = e1 ().size1 (); size_type j_size = e2 ().size2 (); size_type k_size = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ()); m.assign (zero_matrix (m.size1 (), m.size2 ())); for (size_type i_begin = 0; i_begin < i_size; i_begin += i_block_size) { size_type i_end = i_begin + std::min (i_size - i_begin, i_block_size); for (size_type k_begin = 0; k_begin < k_size; k_begin += k_block_size) { size_type k_end = k_begin + std::min (k_size - k_begin, k_block_size); // const matrix > e1_range (project (e1 (), range (i_begin, i_end), range (k_begin, k_end))); const matrix e1_range (project (e1 (), range (i_begin, i_end), range (k_begin, k_end))); for (size_type j_begin = 0; j_begin < j_size; j_begin += j_block_size) { size_type j_end = j_begin + std::min (j_size - j_begin, j_block_size); matrix_range m_range (m, range (i_begin, i_end), range (j_begin, j_end)); if (boost::is_same::value) { const matrix_range e2_range (e2 (), range (k_begin, k_end), range (j_begin, j_end)); m_range.plus_assign (prod (e1_range, e2_range)); } else { // const matrix > e2_range (project (e2 (), range (k_begin, k_end), range (j_begin, j_end))); const matrix e2_range (project (e2 (), range (k_begin, k_end), range (j_begin, j_end))); m_range.plus_assign (prod (e1_range, e2_range)); } } } } return m; } template BOOST_UBLAS_INLINE M block_prod_goto (const matrix_expression &e1, const matrix_expression &e2, column_major_tag) { typedef M matrix_type; const std::size_t i_block_size = IBS; const std::size_t j_block_size = JBS; const std::size_t k_block_size = KBS; typedef const E1 expression1_type; typedef const E2 expression2_type; typedef typename M::size_type size_type; typedef typename M::value_type value_type; M m (e1 ().size1 (), e2 ().size2 ()); size_type i_size = e1 ().size1 (); size_type j_size = e2 ().size2 (); size_type k_size = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ()); m.assign (zero_matrix (m.size1 (), m.size2 ())); for (size_type j_begin = 0; j_begin < j_size; j_begin += j_block_size) { size_type j_end = j_begin + std::min (j_size - j_begin, j_block_size); for (size_type k_begin = 0; k_begin < k_size; k_begin += k_block_size) { size_type k_end = k_begin + std::min (k_size - k_begin, k_block_size); // const matrix > e2_range (project (e2 (), range (k_begin, k_end), range (j_begin, j_end))); const matrix e2_range (project (e2 (), range (k_begin, k_end), range (j_begin, j_end))); for (size_type i_begin = 0; i_begin < i_size; i_begin += i_block_size) { size_type i_end = i_begin + std::min (i_size - i_begin, i_block_size); matrix_range m_range (m, range (i_begin, i_end), range (j_begin, j_end)); if (boost::is_same::value) { const matrix_range e1_range (e1 (), range (i_begin, i_end), range (k_begin, k_end)); m_range.plus_assign (prod (e1_range, e2_range)); } else { // const matrix > e1_range (project (e1 (), range (i_begin, i_end), range (k_begin, k_end))); const matrix e1_range (project (e1 (), range (i_begin, i_end), range (k_begin, k_end))); m_range.plus_assign (prod (e1_range, e2_range)); } } } } return m; } // Dispatcher template BOOST_UBLAS_INLINE M block_prod_goto (const matrix_expression &e1, const matrix_expression &e2) { typedef typename M::orientation_category orientation_category; return block_prod_goto (e1, e2, orientation_category ()); } }}} namespace ublas = boost::numeric::ublas; using namespace std; BOOST_UBLAS_INLINE void cblas_prod (ublas::matrix >& C, const ublas::matrix >& A, const ublas::matrix >& B) { // C(m,n) <- aplha A(m,k) B(k,n) + beta C(m,n) // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); int m = C.size1(); int n = C.size2(); int k = A.size2(); BOOST_UBLAS_CHECK (C.size1() == A.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (A.size2() == B.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (B.size2() == C.size2(), ublas::bad_size ()); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, &A.data()[0], A.size2(), &B.data()[0], B.size2(), 1.0, &C.data()[0], C.size2()); } BOOST_UBLAS_INLINE void cblas_prod (ublas::matrix >& C, const ublas::matrix >& A, const ublas::matrix >& B) { // C(m,n) <- aplha A(m,k) B(k,n) + beta C(m,n) // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); int m = C.size1(); int n = C.size2(); int k = A.size2(); BOOST_UBLAS_CHECK (C.size1() == A.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (A.size2() == B.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (B.size2() == C.size2(), ublas::bad_size ()); cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, m, n, k, 1.0, &A.data()[0], A.size1(), &B.data()[0], B.size2(), 1.0, &C.data()[0], C.size2()); } BOOST_UBLAS_INLINE void cblas_prod (ublas::matrix >& C, const ublas::matrix >& A, const ublas::matrix >& B) { // C(m,n) <- aplha A(m,k) B(k,n) + beta C(m,n) // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); int m = C.size1(); int n = C.size2(); int k = A.size2(); BOOST_UBLAS_CHECK (C.size1() == A.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (A.size2() == B.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (B.size2() == C.size2(), ublas::bad_size ()); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0, &A.data()[0], A.size2(), &B.data()[0], B.size1(), 1.0, &C.data()[0], C.size2()); } BOOST_UBLAS_INLINE void cblas_prod (ublas::matrix >& C, const ublas::matrix >& A, const ublas::matrix >& B) { // C(m,n) <- aplha A(m,k) B(k,n) + beta C(m,n) // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); int m = C.size1(); int n = C.size2(); int k = A.size2(); BOOST_UBLAS_CHECK (C.size1() == A.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (A.size2() == B.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (B.size2() == C.size2(), ublas::bad_size ()); cblas_dgemm(CblasRowMajor, CblasTrans, CblasTrans, m, n, k, 1.0, &A.data()[0], A.size1(), &B.data()[0], B.size1(), 1.0, &C.data()[0], C.size2()); } BOOST_UBLAS_INLINE void cblas_prod (ublas::matrix >& C, const ublas::matrix >& A, const ublas::matrix >& B) { // C(m,n) <- aplha A(m,k) B(k,n) + beta C(m,n) // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); int m = C.size1(); int n = C.size2(); int k = A.size2(); BOOST_UBLAS_CHECK (C.size1() == A.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (A.size2() == B.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (B.size2() == C.size2(), ublas::bad_size ()); cblas_dgemm(CblasColMajor, CblasTrans, CblasTrans, m, n, k, 1.0, &A.data()[0], A.size2(), &B.data()[0], B.size2(), 1.0, &C.data()[0], C.size1()); } BOOST_UBLAS_INLINE void cblas_prod (ublas::matrix >& C, const ublas::matrix >& A, const ublas::matrix >& B) { // C(m,n) <- aplha A(m,k) B(k,n) + beta C(m,n) // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); int m = C.size1(); int n = C.size2(); int k = A.size2(); BOOST_UBLAS_CHECK (C.size1() == A.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (A.size2() == B.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (B.size2() == C.size2(), ublas::bad_size ()); cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0, &A.data()[0], A.size1(), &B.data()[0], B.size2(), 1.0, &C.data()[0], C.size1()); } BOOST_UBLAS_INLINE void cblas_prod (ublas::matrix >& C, const ublas::matrix >& A, const ublas::matrix >& B) { // C(m,n) <- aplha A(m,k) B(k,n) + beta C(m,n) // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); int m = C.size1(); int n = C.size2(); int k = A.size2(); BOOST_UBLAS_CHECK (C.size1() == A.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (A.size2() == B.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (B.size2() == C.size2(), ublas::bad_size ()); cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, m, n, k, 1.0, &A.data()[0], A.size2(), &B.data()[0], B.size1(), 1.0, &C.data()[0], C.size1()); } BOOST_UBLAS_INLINE void cblas_prod (ublas::matrix >& C, const ublas::matrix >& A, const ublas::matrix >& B) { // C(m,n) <- aplha A(m,k) B(k,n) + beta C(m,n) // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); int m = C.size1(); int n = C.size2(); int k = A.size2(); BOOST_UBLAS_CHECK (C.size1() == A.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (A.size2() == B.size1(), ublas::bad_size ()); BOOST_UBLAS_CHECK (B.size2() == C.size2(), ublas::bad_size ()); cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, &A.data()[0], A.size1(), &B.data()[0], B.size1(), 1.0, &C.data()[0], C.size1()); } void zeromatrix(ublas::matrix& A) { for(size_t i = 0; i& A) { for(size_t i = 0; i& A) { for(size_t i = 0; i& A) { for(size_t i = 0; i int equals(const M& lhs, const E& rhs) { return 2; } #endif typedef ublas::matrix RT; typedef ublas::matrix CT; #define mul(a,b,d) \ { \ zeromatrix(X); \ initmatrix(A); \ initmatrix(B); \ t.restart(); \ for (size_t a=0;a(A,B)); \ time = t.elapsed(); \ cout << setw(8) << time << flush; \ cerr << equals(X,prod(A,B)) << endl; \ } #define mul6(xtype) \ { \ zeromatrix(X); \ initmatrix(A); \ initmatrix(B); \ t.restart(); \ X.plus_assign(ublas::block_prod_goto(A,B)); \ time = t.elapsed(); \ cout << setw(8) << time << flush; \ cerr << equals(X,prod(A,B)) << endl; \ } #define mul7 \ { \ zeromatrix(X); \ initmatrix(A); \ initmatrix(B); \ t.restart(); \ cblas_prod(X, A, B); \ time = t.elapsed(); \ /* cout << setw(8) << (2.0*(double(size1)*double(size2)*double(size3))/time)/1e+6 << flush; */ \ cout << setw(8) << time << flush; \ cerr << equals(X,prod(A,B)) << endl; \ } #define mulset(xtype,atype,btype) \ { \ xtype X(size1,size3); \ atype A(size1,size2); \ btype B(size2,size3); \ mul2(xtype); \ mul3(xtype); \ mul4(xtype); \ mul5(xtype); \ mul6(xtype); \ mul7 ; \ cout << endl; \ } /* mul(r,k,c); \ mul(r,c,k); \ mul(k,r,c); \ mul(k,c,r); \ mul(c,r,k); \ mul(c,k,r); \ */ void test(const size_t size1, const size_t size2, const size_t size3) { boost::timer t; double time; cout << "size of metrices - " << size1 << "x" << size2 << "*" << size2 << "x" << size3 << endl << endl; cout << " " << " prod axpy opb" // << " rkc rck krc kcr crk ckr" << " block goto atlas" << endl; cout << "RRR";mulset(RT,RT,RT); cout << "RRC";mulset(RT,RT,CT); cout << "RCR";mulset(RT,CT,RT); cout << "RCC";mulset(RT,CT,CT); cout << "CRR";mulset(CT,RT,RT); cout << "CRC";mulset(CT,RT,CT); cout << "CCR";mulset(CT,CT,RT); cout << "CCC";mulset(CT,CT,CT); } int main(int argc, char *argv[]) { int size1 = 100; if (argc > 1) size1 = ::atoi(argv [1]); int size2 = size1; if (argc > 2) size2 = ::atoi(argv [2]); int size3 = size2; if (argc > 3) size3 = ::atoi(argv [3]); test(size1, size2, size3); } // g++ -I $HOME/include/ -o sample83 sample83.cpp -DNDEBUG -L /usr/lib/3dnow -lcblas -latlas // g++-3.4 -I $HOME/include/ -o sample83 sample83.cpp -DNDEBUG -L /usr/lib/3dnow -lcblas -latlas -O3 -funroll-loops -march=athlon /* size of metrices - 400x400*400x400 prod axpy opb block goto atlas RRR 0.82 0.81 0.9 0.24 0.23 0.06 RRC 0.61 0.87 0.9 0.24 0.23 0.06 RCR 2.55 0.86 0.88 0.24 0.24 0.07 RCC 0.81 0.89 0.88 0.25 0.23 0.07 CRR 0.8 0.88 0.89 0.24 0.23 0.06 CRC 0.58 0.88 0.9 0.24 0.23 0.07 CCR 2.57 0.8 0.88 0.25 0.24 0.07 CCC 0.82 0.77 0.91 0.24 0.23 0.07 */