Ublas :
Subject: Re: [ublas] Matrix multiplication performance
From: Nasos Iliopoulos (nasos_i_at_[hidden])
Date: 2016-02-10 10:02:31
That's great, this is extremely useful.
-N
On 02/10/2016 09:50 AM, Michael Lehn wrote:
> Hi Palik,
>
> sorry for the late response. It's the last week of the semester so it is pretty busy. It is pretty
> cool that prefetching improves the results so significantly. I will add this to the examples on my
> page.
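For reference, __builtin_prefetch is the GCC builtin being discussed here. A minimal standalone sketch, not taken from the thread (the loop, names, and prefetch distance are illustrative assumptions):

void scale(double *a, long n, double alpha)
{
    for (long i = 0; i < n; ++i) {
        if ((i & 7) == 0)                       // roughly one hint per 64-byte cache line
            __builtin_prefetch(a + i + 64, 1);  // second argument: 0 = read intent, 1 = write intent
        a[i] *= alpha;
    }
}

The useful prefetch distance depends on the machine; the kernel quoted further down uses a fixed offset of 16 elements into the packed A buffer.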
>
> Today I added an example on how to implement a fast LU factorization with ublas. The
> implementation still needs polishing but the algorithms are pretty good:
>
> http://www.mathematik.uni-ulm.de/~lehn/test_ublas/session7/page01.html
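For orientation, the LU routines that already ship with ublas (lu_factorize / lu_substitute from <boost/numeric/ublas/lu.hpp>) can serve as a baseline for the blocked implementation described on that page. A minimal sketch; the 3x3 example data is an arbitrary assumption:

#include <iostream>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/io.hpp>

int main()
{
    namespace ublas = boost::numeric::ublas;

    ublas::matrix<double> A(3, 3);
    A(0,0) = 4; A(0,1) = 3; A(0,2) = 2;
    A(1,0) = 2; A(1,1) = 3; A(1,2) = 1;
    A(2,0) = 6; A(2,1) = 1; A(2,2) = 5;

    ublas::vector<double> b(3);
    b(0) = 1; b(1) = 2; b(2) = 3;

    // In-place LU with partial pivoting; lu_factorize returns 0 on success.
    ublas::permutation_matrix<std::size_t> P(A.size1());
    if (ublas::lu_factorize(A, P) == 0) {
        ublas::lu_substitute(A, P, b);   // b now holds the solution of A x = b
        std::cout << b << std::endl;
    }
    return 0;
}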
>
> Cheers,
>
> Michael
>
> On 03 Feb 2016, at 20:53, palik imre <imre_palik_at_[hidden]> wrote:
>
>> Hi Michael,
>>
>> first of all, the link on your page to test_ublas.tgz is still pointing to session4.tgz
>>
>> I also realised that we are starting to become memory bound.
>> I am getting approximately 10% improvement with the following setup on my old AMD box:
>>
>>
>> template <typename Index, typename T>
>> typename std::enable_if<std::is_floating_point<T>::value,
>>                         void>::type
>> ugemm(Index kc, T alpha,
>>       const T *A, const T *B,
>>       T beta,
>>       T *C, Index incRowC, Index incColC)
>> {
>>     A = (const T*) __builtin_assume_aligned (A, 128);
>>     B = (const T*) __builtin_assume_aligned (B, 128);
>>
>>     static const unsigned div = 32/sizeof(T);
>>     static const Index MR = BlockSize<T>::MR;
>>     static const Index NR = BlockSize<T>::NR/div;
>>
>>     typedef T vx __attribute__((vector_size (32)));
>>
>>     vx P[MR*NR] __attribute__ ((aligned (128))) = {};
>>     const vx *B_ = (const vx *)B;
>>     for (Index l=0; l<kc; ++l) {
>>         if (!(kc&3))
>>             __builtin_prefetch(A + 16, 0);
>>         for (Index i=0; i<MR; ++i) {
>>             for (Index j=0; j<(NR); ++j) {
>>                 P[i * NR + j] += A[i]*B_[j];
>>             }
>>         }
>>         A += MR;
>>         B_ += NR;
>>     }
>>
>>     T *P_ = (T *)P;
>>     for (Index j=0; j<NR*div; ++j) {
>>         for (Index i=0; i<MR; ++i) {
>>             C[i*incRowC+j*incColC] *= beta;
>>             C[i*incRowC+j*incColC] += alpha*P_[i * NR * div + j];
>>         }
>>     }
>> }
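The micro-kernel above leans on two GCC features: vector_size types, which give element-wise SIMD arithmetic, and __builtin_assume_aligned, which lets the compiler emit aligned loads. A tiny standalone sketch of the vector_size semantics (illustrative only, not part of the posted kernel):

#include <cstdio>

typedef double vx __attribute__((vector_size(32)));   // 32 bytes = 4 doubles = one AVX register

int main()
{
    vx a = {1.0, 2.0, 3.0, 4.0};
    vx b = {10.0, 20.0, 30.0, 40.0};
    vx c = a * b;                       // element-wise multiply; with -mavx this is a single vmulpd
    for (int i = 0; i < 4; ++i)
        std::printf("%g ", c[i]);       // prints: 10 40 90 160
    std::printf("\n");
    return 0;
}

The constant div = 32/sizeof(T) in the kernel is the number of elements per such vector, which is why BlockSize<T>::NR is divided by div when the accumulator P is counted in vector units.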
>>
>> template <typename Index, typename T, typename Beta, typename TC>
>> void
>> mgemm(Index mc, Index nc, Index kc,
>>       T alpha,
>>       const T *A, const T *B,
>>       Beta beta,
>>       TC *C, Index incRowC, Index incColC)
>> {
>>     const Index MR = BlockSize<T>::MR;
>>     const Index NR = BlockSize<T>::NR;
>>     const Index mp  = (mc+MR-1) / MR;
>>     const Index np  = (nc+NR-1) / NR;
>>     const Index mr_ = mc % MR;
>>     const Index nr_ = nc % NR;
>>
>> #if defined(_OPENMP)
>> #pragma omp parallel for
>> #endif
>>     for (Index j=0; j<np; ++j) {
>>         const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
>>         T C_[BlockSize<T>::MR*BlockSize<T>::NR];
>>         __builtin_prefetch(B + j * kc * NR, 0);
>>         __builtin_prefetch(B + j * kc * NR + 16, 0);
>>         for (Index i=0; i<mp; ++i) {
>>             const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
>>             __builtin_prefetch(A + i * kc * MR, 0);
>>             if (mr==MR && nr==NR) {
>>                 ugemm(kc, alpha,
>>                       &A[i*kc*MR], &B[j*kc*NR],
>>                       beta,
>>                       &C[i*MR*incRowC+j*NR*incColC],
>>                       incRowC, incColC);
>>             } else {
>>                 std::fill_n(C_, MR*NR, T(0));
>>                 ugemm(kc, alpha,
>>                       &A[i*kc*MR], &B[j*kc*NR],
>>                       T(0),
>>                       C_, Index(1), MR);
>>                 gescal(mr, nr, beta,
>>                        &C[i*MR*incRowC+j*NR*incColC],
>>                        incRowC, incColC);
>>                 geaxpy(mr, nr, T(1), C_, Index(1), MR,
>>                        &C[i*MR*incRowC+j*NR*incColC],
>>                        incRowC, incColC);
>>             }
>>         }
>>     }
>> }
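The two functions above also reference a BlockSize<T> traits class and the gescal/geaxpy clean-up helpers from the session code, which the mail does not repeat. A rough sketch of what they look like; the numeric block sizes are machine-dependent assumptions, not values taken from the thread:

// Blocking parameters for the packed GEMM (illustrative values only).
template <typename T>
struct BlockSize {
    static const unsigned MC = 256;   // rows of a packed A block
    static const unsigned KC = 512;   // shared dimension of the packed blocks
    static const unsigned NC = 4096;  // columns of a packed B block
    static const unsigned MR = 4;     // micro-kernel rows
    static const unsigned NR = 8;     // micro-kernel columns
};

// X *= alpha for an m x n matrix stored with general strides.
template <typename Index, typename Alpha, typename TX>
void gescal(Index m, Index n, Alpha alpha,
            TX *X, Index incRowX, Index incColX)
{
    for (Index j = 0; j < n; ++j)
        for (Index i = 0; i < m; ++i)
            X[i*incRowX + j*incColX] *= alpha;
}

// Y += alpha*X with general strides; folds the buffered micro-panel C_
// back into C on the fringe blocks.
template <typename Index, typename Alpha, typename TX, typename TY>
void geaxpy(Index m, Index n, Alpha alpha,
            const TX *X, Index incRowX, Index incColX,
            TY *Y, Index incRowY, Index incColY)
{
    for (Index j = 0; j < n; ++j)
        for (Index i = 0; i < m; ++i)
            Y[i*incRowY + j*incColY] += alpha * X[i*incRowX + j*incColX];
}

gescal and geaxpy only run on the fringe blocks where mr < MR or nr < NR, so their plain loops are not performance critical.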
>>
> _______________________________________________
> ublas mailing list
> ublas_at_[hidden]
> http://lists.boost.org/mailman/listinfo.cgi/ublas
> Sent to: athanasios.iliopoulos.ctr.gr_at_[hidden]
>