<div dir="ltr"><p>I wrote a serial code for a matrix-vector multiplication in a 
Coordinate Storage Format(COO).  COO method has already made a big 
speed-up. My matrix is in a tridiagonal form.  I want  to parallelize 
the code and see how the parallelizm improves the code. My matrix could 
grow up to 2.000.000*2.000.000. Any efficient parallelization approach 
doesn&#39;t come to my mind (I have written some basic mpi programs but not 
much more. So my experience is limited.) Could you give me some 
headstart? <br></p>

<p>Thanks.</p><p>Özhan Fenerci</p><p><br></p><pre><code><font size="4">#include &lt;boost/numeric/ublas/banded.hpp&gt;
#include &lt;boost/numeric/ublas/io.hpp&gt;
#include &lt;vector&gt;
#include &lt;iostream&gt; 
#include &quot;mpi.h&quot;
#include &lt;stdio.h&gt;


int main(int argc, char *argv[] ) {

using namespace boost::numeric::ublas;
using std::cout;
using std::endl;

int numprocs, rank, chunk_size, i,j;
int max, mymax,rem;
int totalNonZeros; //totalNonZeros of TriDiagonal Matrix
MPI_Status status;
//b=A*x
banded_matrix&lt;double&gt; A (10, 10, 1, 1);
//vector&lt;double&gt; x(10); 
//vector&lt;double&gt; b(10); 
int* x = new int[10];
int* b = new int[10];    


/* Initialize MPI */
MPI_Init( &amp;argc,&amp;argv);
MPI_Comm_rank( MPI_COMM_WORLD, &amp;rank);
MPI_Comm_size( MPI_COMM_WORLD, &amp;numprocs);  
//MPI_Status status; 

if (rank == 0)  {
for (signed i = 0; i &lt; signed (A.size1 ()); ++ i) {
    for (signed j = std::max (i - 1, 0); j &lt; std::min (i + 2, signed (A.size2 ())); ++j)
        A (i, j) = i + j;
    x[i]= 1;
}       
}   


if(rank == 0) {
cout &lt;&lt; A &lt;&lt;endl&lt;&lt;endl;


totalNonZeros = 0;
for (signed i = 0; i &lt; signed (A.size1 ()); ++ i) {
    for (signed j = std::max (i - 1, 0); j &lt; std::min (i + 2, signed (A.size2 ())); ++ j)
        if( A(i,j) !=0) 
            totalNonZeros++;
}
}


cout&lt;&lt;&quot;Number ot NonZeros &quot;&lt;&lt;totalNonZeros&lt;&lt;endl;


if(rank == 0) {

     int * row_idx = new int[totalNonZeros]; // row index array
     int * col_idx = new int[totalNonZeros]; // col index array
     int * val = new int[totalNonZeros]; //val array    
     int index=0;

for (signed i = 0; i &lt; signed (A.size1 ()); ++ i) {
    for (signed j = std::max (i - 1, 0); j &lt; std::min (i + 2, signed (A.size2 ())); ++ j)
        if( A(i,j) !=0)   {
            row_idx[index] = i; 
            col_idx[index]=j;
            val[index] = A(i,j);  
            index++;              
        }
}
//Clear out result
for ( int k=0; k&lt;A.size1(); ++k)
{
    b[k]= 0;
}
//Matrix-vector Multiply
for (int index=0 ; index &lt;totalNonZeros;++index){
     b[row_idx[index]] = b[row_idx[index]]+ val[index]*x[col_idx[index]]; cout&lt;&lt;b[3]&lt;&lt;endl;
} 

//delete row_idx;
// delete col_idx;
//delete val;
}                       

MPI_Finalize();
return 0;                  </font></code></pre></div>