/***************************************************************************
    test_add.cpp : Adddition timing test
                             -------------------
    begin                : Sun 2007-01-21
    copyright            : (C) 2007 by Paul C. Leopardi
 ***************************************************************************
 *   This library is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU Lesser General Public License as        *
 *   published by the Free Software Foundation; either version 2.1 of the  *
 *   License, or (at your option) any later version.                       *
 *   See http://www.fsf.org/copyleft/lesser.html for details               *
 ***************************************************************************/

#include <boost/config.hpp>
#include <boost/numeric/ublas/fwd.hpp>

  namespace ublas = boost::numeric::ublas;

  namespace matrix
  {
    /// Kronecker tensor product of matrices - as per Matlab kron
    template< typename Matrix_T >
    const
    Matrix_T
    kron(const Matrix_T& lhs, const Matrix_T& rhs);

    /// Number of non-zeros
    template< typename Matrix_T >
    typename Matrix_T::size_type
    nnz(const Matrix_T& m);

    /// Unit matrix - as per Matlab eye
    template< typename Matrix_T >
    const
    Matrix_T
    unit(const typename Matrix_T::size_type n);

    /// Inner product: sum(x(i,j)*y(i,j))/x.nrows()
    template< typename Scalar_T, typename Matrix_T >
    Scalar_T
    inner(const Matrix_T& lhs, const Matrix_T& rhs);
  }

#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/vector_proxy.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/operation.hpp>
#include <boost/numeric/ublas/operation_sparse.hpp>

namespace matrix
{
  /// Kronecker tensor product of matrices - as per Matlab kron
  template< typename Matrix_T >
  const
  Matrix_T
  kron(const Matrix_T& lhs, const Matrix_T& rhs)
  {
    typedef typename Matrix_T::size_type matrix_index_t;
    const matrix_index_t lhs_s1 = lhs.size1();
    const matrix_index_t lhs_s2 = lhs.size2();
    const matrix_index_t rhs_s1 = rhs.size1();
    const matrix_index_t rhs_s2 = rhs.size2();
    Matrix_T result(lhs_s1*rhs_s1, lhs_s2*rhs_s2);
    typedef typename Matrix_T::value_type Scalar_T;
    typedef typename Matrix_T::const_iterator1 const_iterator1;
    typedef typename Matrix_T::const_iterator2 const_iterator2;
    for (const_iterator1
        lhs_it1 = lhs.begin1();
        lhs_it1 != lhs.end1();
        ++lhs_it1)
      for (const_iterator2
          lhs_it2 = lhs_it1.begin();
          lhs_it2 != lhs_it1.end();
          ++lhs_it2)
      {
        const matrix_index_t start1 = rhs_s1 * lhs_it2.index1();
        const matrix_index_t start2 = rhs_s2 * lhs_it2.index2();
        const Scalar_T& lhs_val = (*lhs_it2);
        for (const_iterator1
            rhs_it1 = rhs.begin1();
            rhs_it1 != rhs.end1();
            ++rhs_it1)
          for (const_iterator2
              rhs_it2 = rhs_it1.begin();
              rhs_it2 != rhs_it1.end();
              ++rhs_it2)
            result(start1 + rhs_it2.index1(), start2 + rhs_it2.index2()) = lhs_val * *rhs_it2;
      }
    return result;
  }

  /// Number of non-zeros
  template< typename Matrix_T >
  inline
  typename Matrix_T::size_type
  nnz(const Matrix_T& m)
  {
    typedef typename Matrix_T::size_type matrix_index_t;
    typedef typename Matrix_T::const_iterator1 const_iterator1;
    typedef typename Matrix_T::const_iterator2 const_iterator2;
    matrix_index_t result = 0;
    for (const_iterator1
        it1 = m.begin1();
        it1 != m.end1();
        ++it1)
      for (const_iterator2
          it2 = it1.begin();
          it2 != it1.end();
          ++it2)
        if (*it2 != 0)
          ++result;
    return result;
  }

  /// Inner product: sum(lhs(i,j)*rhs(i,j))/lhs.nrows()
  template< typename Scalar_T, typename Matrix_T >
  Scalar_T
  inner(const Matrix_T& lhs, const Matrix_T& rhs)
  {
    Scalar_T result = Scalar_T(0);
    for (typename Matrix_T::const_iterator1
        lhs_it1 = lhs.begin1();
        lhs_it1 != lhs.end1();
        ++lhs_it1)
      for (typename Matrix_T::const_iterator2
          lhs_it2 = lhs_it1.begin();
          lhs_it2 != lhs_it1.end();
          ++lhs_it2)
      {
        const Scalar_T& rhs_val = rhs(lhs_it2.index1(),lhs_it2.index2());
        if (rhs_val != Scalar_T(0))
          result += (*lhs_it2) * rhs_val;
      }
    return result / lhs.size1();
  }
}

#include <iomanip>

  typedef double Scalar_T;
  typedef ublas::row_major                           orientation_t;
  typedef ublas::compressed_matrix< Scalar_T, orientation_t >
                                                     matrix_t;
  typedef ublas::matrix< Scalar_T, orientation_t >   dense_matrix_t;
  typedef matrix_t::size_type                        matrix_index_t;

  using namespace std;
  const double MS_PER_S = 1000.0;
  const Scalar_T RAND_SCALE = 1.0/RAND_MAX;

  inline
  static
  double
  elapsed( clock_t cpu_time )
  { return ((clock() - cpu_time)*MS_PER_S) / CLOCKS_PER_SEC; }

  inline
  void
  print_time(matrix_index_t dim, const double add_cpu_time, 
             const double inn, const double fill)
  {
    const int dim_width = 6;
    cout << "Dim =" << setw(dim_width) << dim << ", "
         << "CPU = ";
    const ios::fmtflags& old_flags = cout.flags();
    const streamsize width = 12;
    const streamsize old_prec = cout.precision();
    const streamsize new_prec = 2;
    cout.setf(ios_base::fixed);
    cout.setf(ios_base::showpoint);
    cout << setprecision(new_prec)
         << setw(width) << add_cpu_time << "; "
         << setprecision(old_prec);
    cout.flags(old_flags);
    cout << "Inner product = " << inn << "; "
         << "Fill = " << fill
         << endl;
  }

  template< typename Matrix_T >
  static
  void
  time_add(const Matrix_T& a, const Matrix_T& b)
  {
    const int min_trials_if_zero = 1000;
    Matrix_T c = b;
    const double inn = matrix::inner<Scalar_T>(a, b);
    const double dim = a.size1();
    const double fill = max(matrix::nnz(a), matrix::nnz(b))*1.0/(dim*dim);
    clock_t cpu_time = clock();
     c += a;
    double add_cpu_time = elapsed(cpu_time);
    if (add_cpu_time < MS_PER_S)
    {
      const int max_trials =
       add_cpu_time == 0.0 ?
         min_trials_if_zero :
         int(floor(MS_PER_S/add_cpu_time));
      cpu_time = clock();
      for (int trials = 0; trials != max_trials; ++trials)
      {
        c = b;
        c += a;
      }
      add_cpu_time = elapsed(cpu_time)/max_trials;
      cpu_time = clock();
      for (int trials = 0; trials != max_trials; ++trials)
        c = b;
      add_cpu_time = std::max(0.0, add_cpu_time - elapsed(cpu_time)/max_trials);
    }
    print_time(a.size1(), add_cpu_time, inn, fill);
  }

  template< class Matrix_T >
  static
  void
  add_test(const int max_n)
  {
    cout << "Matrix addition test:" << endl;

    Matrix_T J = Matrix_T(2,2);
    J(0,1) = Scalar_T(-1);
    J(1,0) = Scalar_T( 1);
    Matrix_T K = Matrix_T(2,2);
    K(0,1) = Scalar_T( 1);
    K(1,0) = Scalar_T( 1);
    Matrix_T L = Matrix_T(2,2);
    L(0,0) = Scalar_T(-1);
    L(1,1) = Scalar_T( 1);
    const Matrix_T& dup = J + L;

    Matrix_T a = L;
    Matrix_T b = J;
    for (int n = 1; n != max_n; ++n)
    {
      a = matrix::kron(a,dup) * (rand()*RAND_SCALE);
      b = matrix::kron(b,dup) * (rand()*RAND_SCALE);
      time_add(a, b);
    }
  }

void do_test(const int max_n);

#include <stdio.h>
void do_test(const int max_n)
{
  cout << "Dense: ublas::matrix" << endl;
  add_test< dense_matrix_t >(max_n);
  cout << "Compressed: ublas::compressed_matrix" << endl;
  add_test< matrix_t >(max_n);
}

int main(int argc, char ** argv)
{
  for (argc--, argv++; argc != 0; argc--, argv++)
  {
    int n = 0;
    sscanf(*argv, "%d", &n);
    do_test(n);
  }
  return 0;
}