Boost logo

Ublas :

Subject: Re: [ublas] Matrix multiplication performance
From: palik imre (imre_palik_at_[hidden])
Date: 2016-02-11 02:18:19


Not on Haswell.

There only prefetching B in the macro kernel helps

On Wednesday, 10 February 2016, 15:50, Michael Lehn <michael.lehn_at_[hidden]> wrote:
Hi Palik,

sorry for the late response. It’s the last week of the semester so it is pretty busy … It is pretty
cool that prefetching improves the results so significantly. I will add this to the examples on my
page.

Today I added an example on how-to implement a fast LU factorization with ublas. The
implementation still needs polishing but the algorithms are pretty good:

    http://www.mathematik.uni-ulm.de/~lehn/test_ublas/session7/page01.html

Cheers,

Michael

On 03 Feb 2016, at 20:53, palik imre <imre_palik_at_[hidden]> wrote:

> Hi Michael,
>
> first of all, the link on your page to test_ublas.tgz is still pointing to session4.tgz
>
> I also realised, that we start to become memory bound.
> I am getting approximately 10% improvement with the following setup on my old AMD box:
>
>
> template <typename Index, typename T>
> typename std::enable_if<std::is_floating_point<T>::value,
> void>::type
> ugemm(Index kc, T alpha,
> const T *A, const T *B,
> T beta,
> T *C, Index incRowC, Index incColC)
> {
> A = (const T*) __builtin_assume_aligned (A, 128);
> B = (const T*) __builtin_assume_aligned (B, 128);
>
> static const unsigned div = 32/sizeof(T);
> static const Index MR = BlockSize<T>::MR;
> static const Index NR = BlockSize<T>::NR/div;
>
> typedef T vx __attribute__((vector_size (32)));
>
> vx P[MR*NR] __attribute__ ((aligned (128))) = {};
> const vx *B_ = (const vx *)B;
> for (Index l=0; l<kc; ++l) {
> if (!(kc&3))
> __builtin_prefetch(A + 16, 0);
> for (Index i=0; i<MR; ++i) {
> for (Index j=0; j<(NR); ++j) {
> P[i * NR + j] += A[i]*B_[j];
> }
> }
> A += MR;
> B_ += NR;
> }
>
> T *P_ = (T *)P;
> for (Index j=0; j<NR*div; ++j) {
> for (Index i=0; i<MR; ++i) {
> C[i*incRowC+j*incColC] *= beta;
> C[i*incRowC+j*incColC] += alpha*P_[i * NR * div + j];
> }
> }
> }
>
> template <typename Index, typename T, typename Beta, typename TC>
> void
> mgemm(Index mc, Index nc, Index kc,
> T alpha,
> const T *A, const T *B,
> Beta beta,
> TC *C, Index incRowC, Index incColC)
> {
> const Index MR = BlockSize<T>::MR;
> const Index NR = BlockSize<T>::NR;
> const Index mp = (mc+MR-1) / MR;
> const Index np = (nc+NR-1) / NR;
> const Index mr_ = mc % MR;
> const Index nr_ = nc % NR;
>
> #if defined(_OPENMP)
> #pragma omp parallel for
> #endif
> for (Index j=0; j<np; ++j) {
> const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
> T C_[BlockSize<T>::MR*BlockSize<T>::NR];
> __builtin_prefetch(B + j * kc * NR, 0);
> __builtin_prefetch(B + j * kc * NR + 16, 0);
> for (Index i=0; i<mp; ++i) {
> const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
> __builtin_prefetch(A + i * kc * MR, 0);
> if (mr==MR && nr==NR) {
> ugemm(kc, alpha,
> &A[i*kc*MR], &B[j*kc*NR],
> beta,
> &C[i*MR*incRowC+j*NR*incColC],
> incRowC, incColC);
> } else {
> std::fill_n(C_, MR*NR, T(0));
> ugemm(kc, alpha,
> &A[i*kc*MR], &B[j*kc*NR],
> T(0),
> C_, Index(1), MR);
> gescal(mr, nr, beta,
> &C[i*MR*incRowC+j*NR*incColC],
> incRowC, incColC);
> geaxpy(mr, nr, T(1), C_, Index(1), MR,
> &C[i*MR*incRowC+j*NR*incColC],
> incRowC, incColC);
> }
> }
> }
> }
>