10#ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP 
   11#define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP 
   12#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp" 
   13#include "Tpetra_RowMatrixTransposer.hpp" 
   17namespace MatrixMatrix {
 
   19namespace ExtraKernels {
 
   22template <
class CrsMatrixType>
 
   23size_t C_estimate_nnz_per_row(CrsMatrixType& A, CrsMatrixType& B) {
 
   25  size_t Aest = 100, Best = 100;
 
   26  if (A.getLocalNumEntries() > 0)
 
   27    Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 100;
 
   28  if (B.getLocalNumEntries() > 0)
 
   29    Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries() / B.getLocalNumRows() : 100;
 
   31  size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
 
   32  nnzperrow *= nnzperrow;
 
   38template <
class CrsMatrixType>
 
   39size_t Ac_estimate_nnz(CrsMatrixType& A, CrsMatrixType& P) {
 
   40  size_t nnzPerRowA = 100, Pcols = 100;
 
   41  if (A.getLocalNumEntries() > 0)
 
   42    nnzPerRowA = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 9;
 
   43  if (P.getLocalNumEntries() > 0)
 
   44    Pcols = (P.getLocalNumCols() > 0) ? P.getLocalNumCols() : 100;
 
   45  return (
size_t)(Pcols * nnzPerRowA + 5 * nnzPerRowA + 300);
 
   48#if defined(HAVE_TPETRA_INST_OPENMP) 
   50template <
class Scalar,
 
   53          class LocalOrdinalViewType>
 
   54void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
 
   55                                                 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
 
   56                                                 const LocalOrdinalViewType& targetMapToOrigRow,
 
   57                                                 const LocalOrdinalViewType& targetMapToImportRow,
 
   58                                                 const LocalOrdinalViewType& Bcol2Ccol,
 
   59                                                 const LocalOrdinalViewType& Icol2Ccol,
 
   60                                                 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
 
   61                                                 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
 
   62                                                 const std::string& label,
 
   63                                                 const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
   64#ifdef HAVE_TPETRA_MMM_TIMINGS 
   65  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
   66  using Teuchos::TimeMonitor;
 
   67  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix LTGCore"))));
 
   70  using Teuchos::ArrayRCP;
 
   71  using Teuchos::ArrayView;
 
   77  typedef typename KCRS::device_type device_t;
 
   78  typedef typename KCRS::StaticCrsGraphType graph_t;
 
   79  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
 
   80  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
 
   81  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
 
   82  typedef typename KCRS::values_type::non_const_type scalar_view_t;
 
   86  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
 
   87  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
 
   90  typedef LocalOrdinal LO;
 
   91  typedef GlobalOrdinal GO;
 
   92  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
 
   93  typedef Map<LO, GO, NO> map_type;
 
   98  typedef NO::execution_space execution_space;
 
   99  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 
  102  const LO LO_INVALID  = Teuchos::OrdinalTraits<LO>::invalid();
 
  103  const SC SC_ZERO     = Teuchos::ScalarTraits<Scalar>::zero();
 
  104  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
 
  107  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
 
  108  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
 
  110  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
 
  111  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
 
  112  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
 
  113  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
 
  115  c_lno_view_t Irowptr;
 
  116  lno_nnz_view_t Icolind;
 
  118  if (!Bview.importMatrix.is_null()) {
 
  119    auto lclB         = Bview.importMatrix->getLocalMatrixDevice();
 
  120    Irowptr           = lclB.graph.row_map;
 
  121    Icolind           = lclB.graph.entries;
 
  123    b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
 
  127  RCP<const map_type> Ccolmap = C.getColMap();
 
  128  size_t m                    = Aview.origMatrix->getLocalNumRows();
 
  129  size_t n                    = Ccolmap->getLocalNumElements();
 
  130  size_t Cest_nnz_per_row     = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
 
  133  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
 
  134  if (!params.is_null()) {
 
  135    if (params->isParameter(
"openmp: ltg thread max"))
 
  136      thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
 
  140  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
 
  142  lno_nnz_view_t entriesC;
 
  143  scalar_view_t valuesC;
 
  147  lno_nnz_view_t thread_total_nnz(
"thread_total_nnz", thread_max + 1);
 
  150  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
 
  151  Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
 
  153  double thread_chunk = (double)(m) / thread_max;
 
  156  Kokkos::parallel_for(
"MMM::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
 
  158    size_t my_thread_start = tid * thread_chunk;
 
  159    size_t my_thread_stop  = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
 
  160    size_t my_thread_m     = my_thread_stop - my_thread_start;
 
  163    size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
 
  166    std::vector<size_t> c_status(n, INVALID);
 
  168    u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
 
  169    u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
 
  172    size_t CSR_ip = 0, OLD_ip = 0;
 
  173    for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
 
  177      row_mapC(i) = CSR_ip;
 
  180      for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
 
  182        const SC Aval = Avals(k);    
 
  186        if (targetMapToOrigRow(Aik) != LO_INVALID) {
 
  193          size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
 
  196          for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
 
  198            LO Cij = Bcol2Ccol(Bkj);
 
  200            if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
 
  202              c_status[Cij]   = CSR_ip;
 
  203              Ccolind(CSR_ip) = Cij;
 
  204              Cvals(CSR_ip)   = Aval * Bvals(j);
 
  208              Cvals(c_status[Cij]) += Aval * Bvals(j);
 
  219          size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
 
  220          for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
 
  222            LO Cij = Icol2Ccol(Ikj);
 
  224            if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
 
  226              c_status[Cij]   = CSR_ip;
 
  227              Ccolind(CSR_ip) = Cij;
 
  228              Cvals(CSR_ip)   = Aval * Ivals(j);
 
  232              Cvals(c_status[Cij]) += Aval * Ivals(j);
 
  239      if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1)) * b_max_nnz_per_row) > CSR_alloc) {
 
  241        Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
 
  242        Cvals   = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
 
  246    thread_total_nnz(tid) = CSR_ip;
 
  247    tl_colind(tid)        = Ccolind;
 
  248    tl_values(tid)        = Cvals;
 
  252  copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
 
  254#ifdef HAVE_TPETRA_MMM_TIMINGS 
  256  MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix OpenMPSort"))));
 
  259  if (params.is_null() || params->get(
"sort entries", 
true))
 
  260    Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
 
  261  C.setAllValues(row_mapC, entriesC, valuesC);
 
  265template <
class Scalar,
 
  268          class LocalOrdinalViewType>
 
  269void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
 
  270                                             CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
 
  271                                             const LocalOrdinalViewType& targetMapToOrigRow,
 
  272                                             const LocalOrdinalViewType& targetMapToImportRow,
 
  273                                             const LocalOrdinalViewType& Bcol2Ccol,
 
  274                                             const LocalOrdinalViewType& Icol2Ccol,
 
  275                                             CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
 
  276                                             Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
 
  277                                             const std::string& label,
 
  278                                             const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  279#ifdef HAVE_TPETRA_MMM_TIMINGS 
  280  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  281  using Teuchos::TimeMonitor;
 
  282  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse LTGCore"))));
 
  285  using Teuchos::Array;
 
  286  using Teuchos::ArrayRCP;
 
  287  using Teuchos::ArrayView;
 
  294  typedef typename KCRS::StaticCrsGraphType graph_t;
 
  295  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
 
  296  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
 
  297  typedef typename KCRS::values_type::non_const_type scalar_view_t;
 
  300  typedef LocalOrdinal LO;
 
  301  typedef GlobalOrdinal GO;
 
  302  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
 
  303  typedef Map<LO, GO, NO> map_type;
 
  308  typedef NO::execution_space execution_space;
 
  309  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 
  312  const LO LO_INVALID  = Teuchos::OrdinalTraits<LO>::invalid();
 
  313  const SC SC_ZERO     = Teuchos::ScalarTraits<Scalar>::zero();
 
  314  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
 
  317  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
 
  318  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
 
  319  const KCRS& Cmat = C.getLocalMatrixDevice();
 
  321  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
 
  322  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
 
  323  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
 
  324  scalar_view_t Cvals = Cmat.values;
 
  326  c_lno_view_t Irowptr;
 
  327  c_lno_nnz_view_t Icolind;
 
  329  if (!Bview.importMatrix.is_null()) {
 
  330    auto lclB = Bview.importMatrix->getLocalMatrixDevice();
 
  331    Irowptr   = lclB.graph.row_map;
 
  332    Icolind   = lclB.graph.entries;
 
  337  RCP<const map_type> Ccolmap = C.getColMap();
 
  338  size_t m                    = Aview.origMatrix->getLocalNumRows();
 
  339  size_t n                    = Ccolmap->getLocalNumElements();
 
  342  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
 
  343  if (!params.is_null()) {
 
  344    if (params->isParameter(
"openmp: ltg thread max"))
 
  345      thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
 
  348  double thread_chunk = (double)(m) / thread_max;
 
  351  Kokkos::parallel_for(
"MMM::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
 
  353    size_t my_thread_start = tid * thread_chunk;
 
  354    size_t my_thread_stop  = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
 
  357    std::vector<size_t> c_status(n, INVALID);
 
  360    size_t CSR_ip = 0, OLD_ip = 0;
 
  361    for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
 
  365      CSR_ip = Crowptr(i + 1);
 
  366      for (
size_t k = OLD_ip; k < CSR_ip; k++) {
 
  367        c_status[Ccolind(k)] = k;
 
  372      for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
 
  374        const SC Aval = Avals(k);
 
  378        if (targetMapToOrigRow(Aik) != LO_INVALID) {
 
  380          size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
 
  382          for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
 
  384            LO Cij = Bcol2Ccol(Bkj);
 
  386            TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
 
  387                                       std::runtime_error, 
"Trying to insert a new entry (" << i << 
"," << Cij << 
") into a static graph " 
  388                                                                                            << 
"(c_status = " << c_status[Cij] << 
" of [" << OLD_ip << 
"," << CSR_ip << 
"))");
 
  390            Cvals(c_status[Cij]) += Aval * Bvals(j);
 
  394          size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
 
  395          for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
 
  397            LO Cij = Icol2Ccol(Ikj);
 
  399            TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
 
  400                                       std::runtime_error, 
"Trying to insert a new entry (" << i << 
"," << Cij << 
") into a static graph " 
  401                                                                                            << 
"(c_status = " << c_status[Cij] << 
" of [" << OLD_ip << 
"," << CSR_ip << 
"))");
 
  403            Cvals(c_status[Cij]) += Aval * Ivals(j);
 
  414template <
class Scalar,
 
  417          class LocalOrdinalViewType>
 
  418void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
 
  419                                                   const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
 
  420                                                   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
 
  421                                                   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
 
  422                                                   const LocalOrdinalViewType& targetMapToOrigRow,
 
  423                                                   const LocalOrdinalViewType& targetMapToImportRow,
 
  424                                                   const LocalOrdinalViewType& Bcol2Ccol,
 
  425                                                   const LocalOrdinalViewType& Icol2Ccol,
 
  426                                                   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
 
  427                                                   Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
 
  428                                                   const std::string& label,
 
  429                                                   const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  430#ifdef HAVE_TPETRA_MMM_TIMINGS 
  431  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  432  using Teuchos::TimeMonitor;
 
  433  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix LTGCore"))));
 
  436  using Teuchos::Array;
 
  437  using Teuchos::ArrayRCP;
 
  438  using Teuchos::ArrayView;
 
  443  typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
 
  445  typedef typename KCRS::device_type device_t;
 
  446  typedef typename KCRS::StaticCrsGraphType graph_t;
 
  447  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
 
  448  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
 
  449  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
 
  450  typedef typename KCRS::values_type::non_const_type scalar_view_t;
 
  454  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
 
  455  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
 
  458  typedef typename scalar_view_t::memory_space scalar_memory_space;
 
  461  typedef LocalOrdinal LO;
 
  462  typedef GlobalOrdinal GO;
 
  464  typedef Map<LO, GO, NO> map_type;
 
  469  typedef NO::execution_space execution_space;
 
  470  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 
  473  const LO LO_INVALID  = Teuchos::OrdinalTraits<LO>::invalid();
 
  474  const SC SC_ZERO     = Teuchos::ScalarTraits<Scalar>::zero();
 
  475  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
 
  478  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
 
  479  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
 
  481  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
 
  482  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
 
  483  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
 
  484  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
 
  486  c_lno_view_t Irowptr;
 
  487  lno_nnz_view_t Icolind;
 
  489  if (!Bview.importMatrix.is_null()) {
 
  490    auto lclB         = Bview.importMatrix->getLocalMatrixDevice();
 
  491    Irowptr           = lclB.graph.row_map;
 
  492    Icolind           = lclB.graph.entries;
 
  494    b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
 
  499      Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
 
  502  RCP<const map_type> Ccolmap = C.getColMap();
 
  503  size_t m                    = Aview.origMatrix->getLocalNumRows();
 
  504  size_t n                    = Ccolmap->getLocalNumElements();
 
  505  size_t Cest_nnz_per_row     = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
 
  508  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
 
  509  if (!params.is_null()) {
 
  510    if (params->isParameter(
"openmp: ltg thread max"))
 
  511      thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
 
  515  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
 
  517  lno_nnz_view_t entriesC;
 
  518  scalar_view_t valuesC;
 
  522  lno_nnz_view_t thread_total_nnz(
"thread_total_nnz", thread_max + 1);
 
  525  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
 
  526  Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
 
  528  double thread_chunk = (double)(m) / thread_max;
 
  531  Kokkos::parallel_for(
"Jacobi::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
 
  533    size_t my_thread_start = tid * thread_chunk;
 
  534    size_t my_thread_stop  = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
 
  535    size_t my_thread_m     = my_thread_stop - my_thread_start;
 
  538    size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
 
  541    std::vector<size_t> c_status(n, INVALID);
 
  543    u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
 
  544    u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
 
  547    size_t CSR_ip = 0, OLD_ip = 0;
 
  548    for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
 
  553      row_mapC(i) = CSR_ip;
 
  555      SC minusOmegaDval = -omega * Dvals(i, 0);
 
  558      for (
size_t j = Browptr(i); j < Browptr(i + 1); j++) {
 
  559        const SC Bval = Bvals(j);
 
  563        LO Cij = Bcol2Ccol(Bij);
 
  566        c_status[Cij]   = CSR_ip;
 
  567        Ccolind(CSR_ip) = Cij;
 
  568        Cvals(CSR_ip)   = Bvals[j];
 
  574      for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
 
  576        const SC Aval = Avals(k);    
 
  580        if (targetMapToOrigRow(Aik) != LO_INVALID) {
 
  587          size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
 
  590          for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
 
  592            LO Cij = Bcol2Ccol(Bkj);
 
  594            if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
 
  596              c_status[Cij]   = CSR_ip;
 
  597              Ccolind(CSR_ip) = Cij;
 
  598              Cvals(CSR_ip)   = minusOmegaDval * Aval * Bvals(j);
 
  602              Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
 
  613          size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
 
  614          for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
 
  616            LO Cij = Icol2Ccol(Ikj);
 
  618            if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
 
  620              c_status[Cij]   = CSR_ip;
 
  621              Ccolind(CSR_ip) = Cij;
 
  622              Cvals(CSR_ip)   = minusOmegaDval * Aval * Ivals(j);
 
  626              Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
 
  633      if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1) + 1) * b_max_nnz_per_row) > CSR_alloc) {
 
  635        Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
 
  636        Cvals   = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
 
  640    thread_total_nnz(tid) = CSR_ip;
 
  641    tl_colind(tid)        = Ccolind;
 
  642    tl_values(tid)        = Cvals;
 
  646  copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
 
  648#ifdef HAVE_TPETRA_MMM_TIMINGS 
  650  MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix OpenMPSort"))));
 
  653  if (params.is_null() || params->get(
"sort entries", 
true))
 
  654    Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
 
  655  C.setAllValues(row_mapC, entriesC, valuesC);
 
  659template <
class Scalar,
 
  662          class LocalOrdinalViewType>
 
  663void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
 
  664                                               const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
 
  665                                               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
 
  666                                               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
 
  667                                               const LocalOrdinalViewType& targetMapToOrigRow,
 
  668                                               const LocalOrdinalViewType& targetMapToImportRow,
 
  669                                               const LocalOrdinalViewType& Bcol2Ccol,
 
  670                                               const LocalOrdinalViewType& Icol2Ccol,
 
  671                                               CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
 
  672                                               Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
 
  673                                               const std::string& label,
 
  674                                               const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  675#ifdef HAVE_TPETRA_MMM_TIMINGS 
  676  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  677  using Teuchos::TimeMonitor;
 
  678  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse LTGCore"))));
 
  680  using Teuchos::Array;
 
  681  using Teuchos::ArrayRCP;
 
  682  using Teuchos::ArrayView;
 
  687  typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
 
  690  typedef typename KCRS::StaticCrsGraphType graph_t;
 
  691  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
 
  692  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
 
  693  typedef typename KCRS::values_type::non_const_type scalar_view_t;
 
  696  typedef typename scalar_view_t::memory_space scalar_memory_space;
 
  699  typedef LocalOrdinal LO;
 
  700  typedef GlobalOrdinal GO;
 
  702  typedef Map<LO, GO, NO> map_type;
 
  707  typedef NO::execution_space execution_space;
 
  708  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 
  711  const LO LO_INVALID  = Teuchos::OrdinalTraits<LO>::invalid();
 
  712  const SC SC_ZERO     = Teuchos::ScalarTraits<Scalar>::zero();
 
  713  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
 
  716  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
 
  717  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
 
  718  const KCRS& Cmat = C.getLocalMatrixDevice();
 
  720  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
 
  721  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
 
  722  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
 
  723  scalar_view_t Cvals = Cmat.values;
 
  725  c_lno_view_t Irowptr;
 
  726  c_lno_nnz_view_t Icolind;
 
  728  if (!Bview.importMatrix.is_null()) {
 
  729    auto lclB = Bview.importMatrix->getLocalMatrixDevice();
 
  730    Irowptr   = lclB.graph.row_map;
 
  731    Icolind   = lclB.graph.entries;
 
  737      Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
 
  740  RCP<const map_type> Ccolmap = C.getColMap();
 
  741  size_t m                    = Aview.origMatrix->getLocalNumRows();
 
  742  size_t n                    = Ccolmap->getLocalNumElements();
 
  745  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
 
  746  if (!params.is_null()) {
 
  747    if (params->isParameter(
"openmp: ltg thread max"))
 
  748      thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
 
  751  double thread_chunk = (double)(m) / thread_max;
 
  754  Kokkos::parallel_for(
"Jacobi::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
 
  756    size_t my_thread_start = tid * thread_chunk;
 
  757    size_t my_thread_stop  = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
 
  760    std::vector<size_t> c_status(n, INVALID);
 
  763    size_t CSR_ip = 0, OLD_ip = 0;
 
  764    for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
 
  768      CSR_ip = Crowptr(i + 1);
 
  770      SC minusOmegaDval = -omega * Dvals(i, 0);
 
  772      for (
size_t k = OLD_ip; k < CSR_ip; k++) {
 
  773        c_status[Ccolind(k)] = k;
 
  779      for (
size_t j = Browptr(i); j < Browptr(i + 1); j++) {
 
  780        const SC Bval = Bvals(j);
 
  784        LO Cij = Bcol2Ccol(Bij);
 
  787        Cvals(c_status[Cij]) += Bvals(j);
 
  791      for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
 
  793        const SC Aval = Avals(k);
 
  797        if (targetMapToOrigRow(Aik) != LO_INVALID) {
 
  799          size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
 
  801          for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
 
  803            LO Cij = Bcol2Ccol(Bkj);
 
  805            TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
 
  806                                       std::runtime_error, 
"Trying to insert a new entry (" << i << 
"," << Cij << 
") into a static graph " 
  807                                                                                            << 
"(c_status = " << c_status[Cij] << 
" of [" << OLD_ip << 
"," << CSR_ip << 
"))");
 
  809            Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
 
  813          size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
 
  814          for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
 
  816            LO Cij = Icol2Ccol(Ikj);
 
  818            TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
 
  819                                       std::runtime_error, 
"Trying to insert a new entry (" << i << 
"," << Cij << 
") into a static graph " 
  820                                                                                            << 
"(c_status = " << c_status[Cij] << 
" of [" << OLD_ip << 
"," << CSR_ip << 
"))");
 
  822            Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
 
  833template <
class InColindArrayType, 
class InValsArrayType,
 
  834          class OutRowptrType, 
class OutColindType, 
class OutValsType>
 
  835void copy_out_from_thread_memory(
const OutColindType& thread_total_nnz,
 
  836                                 const InColindArrayType& Incolind,
 
  837                                 const InValsArrayType& Invalues,
 
  839                                 const double thread_chunk,
 
  840                                 OutRowptrType& row_mapC,
 
  841                                 OutColindType& entriesC,
 
  842                                 OutValsType& valuesC) {
 
  843  typedef OutRowptrType lno_view_t;
 
  844  typedef OutColindType lno_nnz_view_t;
 
  845  typedef OutValsType scalar_view_t;
 
  846  typedef typename lno_view_t::execution_space execution_space;
 
  847  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 
  850  size_t thread_max = Incolind.size();
 
  851  size_t c_nnz_size = 0;
 
  856  lno_view_t thread_start_nnz(
"thread_nnz", thread_max + 1);
 
  857  Kokkos::parallel_scan(
"LTG::Scan", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t i, 
size_t& update, 
const bool final) {
 
  858    size_t mynnz = thread_total_nnz(i);
 
  859    if (
final) thread_start_nnz(i) = update;
 
  861    if (
final && i + 1 == thread_max) thread_start_nnz(i + 1) = update;
 
  863  c_nnz_size = thread_start_nnz(thread_max);
 
  866  row_mapC(m) = thread_start_nnz(thread_max);
 
  869  lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
 
  870  entriesC = entriesC_;
 
  871  scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
 
  875  Kokkos::parallel_for(
"LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
 
  876    const size_t my_thread_start  = tid * thread_chunk;
 
  877    const size_t my_thread_stop   = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
 
  878    const size_t nnz_thread_start = thread_start_nnz(tid);
 
  880    const size_t nnz_thread_stop = thread_start_nnz(tid + 1);
 
  891    if (my_thread_start != 0) {
 
  892      for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
 
  893        row_mapC(i) += nnz_thread_start;
 
  909    const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
 
  910    for (
size_t i = 0; i < my_num_nnz; ++i) {
 
  911      entriesC(nnz_thread_start + i) = Incolind(tid)(i);
 
  912      valuesC(nnz_thread_start + i)  = Invalues(tid)(i);
 
  917    if (Incolind(tid).data()) free(Incolind(tid).data());
 
  918    if (Invalues(tid).data()) free(Invalues(tid).data());
 
  925template <
class Scalar, 
class LocalOrdinal, 
class GlobalOrdinal, 
class Node, 
class LocalOrdinalViewType>
 
  926void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
 
  927                                                 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
 
  928                                                 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
 
  929                                                 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
 
  930                                                 const LocalOrdinalViewType& Acol2Brow,
 
  931                                                 const LocalOrdinalViewType& Acol2Irow,
 
  932                                                 const LocalOrdinalViewType& Bcol2Ccol,
 
  933                                                 const LocalOrdinalViewType& Icol2Ccol,
 
  934                                                 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
 
  935                                                 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Cimport,
 
  936                                                 const std::string& label,
 
  937                                                 const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  938#ifdef HAVE_TPETRA_MMM_TIMINGS 
  939  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  940  using Teuchos::TimeMonitor;
 
  941  Teuchos::RCP<TimeMonitor> MM  = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK"))));
 
  942  Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Multiply"))));
 
  945  typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
 
  950  Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(
new Matrix_t(C.getRowMap(), 0));
 
  951  Tpetra::MMdetails::mult_A_B_newmatrix(Aview, Bview, *AB, label + std::string(
" MSAK"), params);
 
  953#ifdef HAVE_TPETRA_MMM_TIMINGS 
  955  MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Scale"))));
 
  961#ifdef HAVE_TPETRA_MMM_TIMINGS 
  963  MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Add"))));
 
  967  Teuchos::ParameterList jparams;
 
  968  if (!params.is_null()) {
 
  970    jparams.set(
"label", label + std::string(
" MSAK Add"));
 
  972  Scalar one = Teuchos::ScalarTraits<Scalar>::one();
 
  973  Tpetra::MatrixMatrix::add(one, 
false, *Bview.origMatrix, Scalar(-omega), 
false, *AB, C, AB->getDomainMap(), AB->getRangeMap(), Teuchos::rcp(&jparams, 
false));
 
  974#ifdef HAVE_TPETRA_MMM_TIMINGS 
  979#if defined(HAVE_TPETRA_INST_OPENMP) 
  981template <
class Scalar, 
class LocalOrdinal, 
class GlobalOrdinal, 
class LocalOrdinalViewType>
 
  982static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
 
  983                                                                 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
 
  984                                                                 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
 
  985                                                                 const LocalOrdinalViewType& Acol2PRow,
 
  986                                                                 const LocalOrdinalViewType& Acol2PRowImport,
 
  987                                                                 const LocalOrdinalViewType& Pcol2Accol,
 
  988                                                                 const LocalOrdinalViewType& PIcol2Accol,
 
  989                                                                 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
 
  990                                                                 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
 
  991                                                                 const std::string& label,
 
  992                                                                 const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  994  using Tpetra::MatrixMatrix::UnmanagedView;
 
  995#ifdef HAVE_TPETRA_MMM_TIMINGS 
  996  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  998  using Teuchos::TimeMonitor;
 
  999  RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix LTGCore"))));
 
 1002  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
 
 1004  typedef LocalOrdinal LO;
 
 1005  typedef GlobalOrdinal GO;
 
 1007  typedef Map<LO, GO, NO> map_type;
 
 1009  typedef typename KCRS::StaticCrsGraphType graph_t;
 
 1010  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
 
 1011  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
 
 1012  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
 
 1013  typedef typename KCRS::values_type::non_const_type scalar_view_t;
 
 1014  typedef typename KCRS::device_type device_t;
 
 1015  typedef typename device_t::execution_space execution_space;
 
 1016  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 
 1019  typedef UnmanagedView<lno_view_t> u_lno_view_t;
 
 1020  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
 
 1021  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
 
 1023  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 
 1024  const SC SC_ZERO    = Teuchos::ScalarTraits<Scalar>::zero();
 
 1027  RCP<const map_type> Accolmap = Ac.getColMap();
 
 1028  size_t m                     = Rview.origMatrix->getLocalNumRows();
 
 1029  size_t n                     = Accolmap->getLocalNumElements();
 
 1032  const KCRS& Rmat = Rview.origMatrix->getLocalMatrixDevice();
 
 1033  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
 
 1034  const KCRS& Pmat = Pview.origMatrix->getLocalMatrixDevice();
 
 1036  c_lno_view_t Rrowptr         = Rmat.graph.row_map,
 
 1037               Arowptr         = Amat.graph.row_map,
 
 1038               Prowptr         = Pmat.graph.row_map, Irowptr;
 
 1039  const lno_nnz_view_t Rcolind = Rmat.graph.entries,
 
 1040                       Acolind = Amat.graph.entries,
 
 1041                       Pcolind = Pmat.graph.entries;
 
 1042  lno_nnz_view_t Icolind;
 
 1043  const scalar_view_t Rvals = Rmat.values,
 
 1044                      Avals = Amat.values,
 
 1045                      Pvals = Pmat.values;
 
 1046  scalar_view_t Ivals;
 
 1048  if (!Pview.importMatrix.is_null()) {
 
 1049    const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
 
 1050    Irowptr          = Imat.graph.row_map;
 
 1051    Icolind          = Imat.graph.entries;
 
 1052    Ivals            = Imat.values;
 
 1063  size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
 
 1064  size_t INVALID           = Teuchos::OrdinalTraits<size_t>::invalid();
 
 1067  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
 
 1068  if (!params.is_null()) {
 
 1069    if (params->isParameter(
"openmp: ltg thread max"))
 
 1070      thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
 
 1073  double thread_chunk = (double)(m) / thread_max;
 
 1076  lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
 
 1078  lno_nnz_view_t entriesAc;
 
 1079  scalar_view_t valuesAc;
 
 1083  lno_nnz_view_t thread_total_nnz(
"thread_total_nnz", thread_max + 1);
 
 1093  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
 
 1094  Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
 
 1097  Kokkos::parallel_for(
"MMM::RAP::NewMatrix::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
 
 1099    size_t my_thread_start = tid * thread_chunk;
 
 1100    size_t my_thread_stop  = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
 
 1101    size_t my_thread_m     = my_thread_stop - my_thread_start;
 
 1103    size_t nnzAllocated = (size_t)(my_thread_m * Acest_nnz_per_row + 100);
 
 1105    std::vector<size_t> ac_status(n, INVALID);
 
 1108    u_lno_view_t Acrowptr((
typename u_lno_view_t::data_type)malloc(u_lno_view_t::shmem_size(my_thread_m + 1)), my_thread_m + 1);
 
 1109    u_lno_nnz_view_t Accolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
 
 1110    u_scalar_view_t Acvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
 
 1113    size_t nnz = 0, nnz_old = 0;
 
 1115    for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
 
 1119      for (
size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
 
 1121        const SC Rik = Rvals(kk);    
 
 1125        for (
size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
 
 1127          const SC Akl = Avals(ll);    
 
 1131          if (Acol2PRow[l] != LO_INVALID) {
 
 1138            size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
 
 1141            for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
 
 1143              LO Acj = Pcol2Accol(j);
 
 1146              if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
 
 1147#ifdef HAVE_TPETRA_DEBUG 
 1149                TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
 
 1151                                           label << 
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
 
 1154                ac_status[Acj] = nnz;
 
 1155                Accolind(nnz)  = Acj;
 
 1156                Acvals(nnz)    = Rik * Akl * Plj;
 
 1159                Acvals(ac_status[Acj]) += Rik * Akl * Plj;
 
 1169            size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
 
 1170            for (
size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
 
 1172              LO Acj = PIcol2Accol(j);
 
 1175              if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
 
 1176#ifdef HAVE_TPETRA_DEBUG 
 1178                TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
 
 1180                                           label << 
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
 
 1183                ac_status[Acj] = nnz;
 
 1184                Accolind(nnz)  = Acj;
 
 1185                Acvals(nnz)    = Rik * Akl * Plj;
 
 1188                Acvals(ac_status[Acj]) += Rik * Akl * Plj;
 
 1195      if (nnz + n > nnzAllocated) {
 
 1197        Accolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
 
 1198        Acvals   = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*)Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
 
 1202    thread_total_nnz(tid) = nnz;
 
 1203    tl_colind(tid)        = Accolind;
 
 1204    tl_values(tid)        = Acvals;
 
 1206#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1208  MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix copy from thread local"))));
 
 1211  copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
 
 1213#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1215  MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
 
 1219  Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
 
 1221  Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
 
 1223#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1225  MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
 
 1236  RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
 
 1237  labelList->set(
"Timer Label", label);
 
 1238  if (!params.is_null()) labelList->set(
"compute global constants", params->get(
"compute global constants", 
true));
 
 1239  RCP<const Export<LO, GO, NO> > dummyExport;
 
 1240  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
 
 1241                              Rview.origMatrix->getRangeMap(),
 
 1248template <
class Scalar, 
class LocalOrdinal, 
class GlobalOrdinal, 
class LocalOrdinalViewType>
 
 1249static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
 
 1250                                                             CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
 
 1251                                                             CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
 
 1252                                                             const LocalOrdinalViewType& Acol2PRow,
 
 1253                                                             const LocalOrdinalViewType& Acol2PRowImport,
 
 1254                                                             const LocalOrdinalViewType& Pcol2Accol,
 
 1255                                                             const LocalOrdinalViewType& PIcol2Accol,
 
 1256                                                             CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
 
 1257                                                             Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
 
 1258                                                             const std::string& label,
 
 1259                                                             const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
 1261  using Tpetra::MatrixMatrix::UnmanagedView;
 
 1262#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1263  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
 1265  using Teuchos::TimeMonitor;
 
 1266  RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse LTGCore"))));
 
 1269  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
 
 1271  typedef LocalOrdinal LO;
 
 1272  typedef GlobalOrdinal GO;
 
 1274  typedef Map<LO, GO, NO> map_type;
 
 1276  typedef typename KCRS::StaticCrsGraphType graph_t;
 
 1277  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
 
 1278  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
 
 1279  typedef typename KCRS::values_type::non_const_type scalar_view_t;
 
 1280  typedef typename KCRS::device_type device_t;
 
 1281  typedef typename device_t::execution_space execution_space;
 
 1282  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 
 1284  const LO LO_INVALID  = Teuchos::OrdinalTraits<LO>::invalid();
 
 1285  const SC SC_ZERO     = Teuchos::ScalarTraits<Scalar>::zero();
 
 1286  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
 
 1289  RCP<const map_type> Accolmap = Ac.getColMap();
 
 1290  size_t m                     = Rview.origMatrix->getLocalNumRows();
 
 1291  size_t n                     = Accolmap->getLocalNumElements();
 
 1294  const KCRS& Rmat = Rview.origMatrix->getLocalMatrixDevice();
 
 1295  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
 
 1296  const KCRS& Pmat = Pview.origMatrix->getLocalMatrixDevice();
 
 1297  const KCRS& Cmat = Ac.getLocalMatrixDevice();
 
 1299  c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
 
 1300  const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
 
 1301  lno_nnz_view_t Icolind;
 
 1302  const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
 
 1303  scalar_view_t Cvals = Cmat.values;
 
 1304  scalar_view_t Ivals;
 
 1306  if (!Pview.importMatrix.is_null()) {
 
 1307    const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
 
 1308    Irowptr          = Imat.graph.row_map;
 
 1309    Icolind          = Imat.graph.entries;
 
 1310    Ivals            = Imat.values;
 
 1314  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
 
 1315  if (!params.is_null()) {
 
 1316    if (params->isParameter(
"openmp: ltg thread max"))
 
 1317      thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
 
 1320  double thread_chunk = (double)(m) / thread_max;
 
 1323  Kokkos::parallel_for(
"MMM::RAP::Reuse::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
 
 1325    size_t my_thread_start = tid * thread_chunk;
 
 1326    size_t my_thread_stop  = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
 
 1328    std::vector<size_t> c_status(n, INVALID);
 
 1331    size_t OLD_ip = 0, CSR_ip = 0;
 
 1333    for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
 
 1336      OLD_ip = Crowptr(i);
 
 1337      CSR_ip = Crowptr(i + 1);
 
 1338      for (
size_t k = OLD_ip; k < CSR_ip; k++) {
 
 1339        c_status[Ccolind(k)] = k;
 
 1345      for (
size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
 
 1347        const SC Rik = Rvals(kk);    
 
 1351        for (
size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
 
 1353          const SC Akl = Avals(ll);    
 
 1357          if (Acol2PRow[l] != LO_INVALID) {
 
 1364            size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
 
 1367            for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
 
 1369              LO Cij = Pcol2Accol(j);
 
 1371              TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
 
 1372                                         std::runtime_error, 
"Trying to insert a new entry (" << i << 
"," << Cij << 
") into a static graph " 
 1373                                                                                              << 
"(c_status = " << c_status[Cij] << 
" of [" << OLD_ip << 
"," << CSR_ip << 
"))");
 
 1375              Cvals(c_status[Cij]) += Rik * Akl * Plj;
 
 1384            size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
 
 1385            for (
size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
 
 1387              LO Cij = PIcol2Accol(j);
 
 1389              TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
 
 1390                                         std::runtime_error, 
"Trying to insert a new entry (" << i << 
"," << Cij << 
") into a static graph " 
 1391                                                                                              << 
"(c_status = " << c_status[Cij] << 
" of [" << OLD_ip << 
"," << CSR_ip << 
"))");
 
 1393              Cvals(c_status[Cij]) += Rik * Akl * Plj;
 
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
 
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
 
Namespace Tpetra contains the class and methods constituting the Tpetra library.