10#ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP 
   11#define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP 
   14#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"   
   15#include "Teuchos_VerboseObject.hpp" 
   16#include "Teuchos_Array.hpp" 
   18#include "Tpetra_ConfigDefs.hpp" 
   19#include "Tpetra_CrsMatrix.hpp" 
   21#include "Tpetra_RowMatrixTransposer.hpp" 
   22#include "Tpetra_ConfigDefs.hpp" 
   23#include "Tpetra_Map.hpp" 
   24#include "Tpetra_Export.hpp" 
   29#include "Teuchos_FancyOStream.hpp" 
   40#include "TpetraExt_MatrixMatrix_OpenMP.hpp" 
   41#include "TpetraExt_MatrixMatrix_Cuda.hpp" 
   42#include "TpetraExt_MatrixMatrix_HIP.hpp" 
   43#include "TpetraExt_MatrixMatrix_SYCL.hpp" 
   47namespace TripleMatrixMultiply {
 
   55template <
class Scalar,
 
   68    const std::string& label,
 
   69    const Teuchos::RCP<Teuchos::ParameterList>& 
params) {
 
   83#ifdef HAVE_TPETRA_MMM_TIMINGS 
   84  std::string 
prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
   85  using Teuchos::TimeMonitor;
 
   89  const std::string 
prefix = 
"TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";
 
  119  const bool newFlag = !
Ac.getGraph()->isLocallyIndexed() && !
Ac.getGraph()->isGloballyIndexed();
 
  121  using Teuchos::ParameterList;
 
  157                             prefix << 
"ERROR, inner dimensions of op(R) and op(A) " 
  158                                       "must match for matrix-matrix product. op(R) is " 
  162                             prefix << 
"ERROR, inner dimensions of op(A) and op(P) " 
  163                                       "must match for matrix-matrix product. op(A) is " 
  171                             prefix << 
"ERROR, dimensions of result Ac must " 
  172                                       "match dimensions of op(R) * op(A) * op(P). Ac has " 
  173                                    << 
Ac.getGlobalNumRows()
 
  174                                    << 
" rows, should have at least " << 
Rleft << std::endl);
 
  198#ifdef HAVE_TPETRA_MMM_TIMINGS 
  229#ifdef HAVE_TPETRA_MMM_TIMINGS 
  274#ifdef HAVE_TPETRA_MMM_TIMINGS 
  295      labelList.set(
"compute global constants", 
params->get(
"compute global constants", 
true));
 
  300                          "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
 
  303    export_type 
exporter = export_type(*
Pprime->getGraph()->getImporter());
 
  310#ifdef HAVE_TPETRA_MMM_STATISTICS 
 
  320template <
class Scalar,
 
  324void mult_R_A_P_newmatrix(
 
  325    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
 
  326    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
 
  327    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
 
  328    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
 
  329    const std::string& label,
 
  330    const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  331  using Teuchos::Array;
 
  332  using Teuchos::ArrayRCP;
 
  333  using Teuchos::ArrayView;
 
  338  typedef LocalOrdinal LO;
 
  339  typedef GlobalOrdinal GO;
 
  342  typedef Import<LO, GO, NO> import_type;
 
  343  typedef Map<LO, GO, NO> map_type;
 
  346  typedef typename map_type::local_map_type local_map_type;
 
  348  typedef typename KCRS::StaticCrsGraphType graph_t;
 
  349  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
 
  350  typedef typename NO::execution_space execution_space;
 
  351  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 
  352  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
 
  354#ifdef HAVE_TPETRA_MMM_TIMINGS 
  355  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  356  using Teuchos::TimeMonitor;
 
  357  RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
 
  359  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 
  362  RCP<const import_type> Cimport;
 
  363  RCP<const map_type> Ccolmap;
 
  364  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
 
  365  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
 
  366  local_map_type Acolmap_local   = Aview.colMap->getLocalMap();
 
  367  local_map_type Prowmap_local   = Pview.origMatrix->getRowMap()->getLocalMap();
 
  368  local_map_type Irowmap_local;
 
  369  if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
 
  370  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
 
  371  local_map_type Icolmap_local;
 
  372  if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
 
  379  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Pcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
 
  381  if (Pview.importMatrix.is_null()) {
 
  384    Ccolmap             = Pview.colMap;
 
  385    const LO colMapSize = 
static_cast<LO
>(Pview.colMap->getLocalNumElements());
 
  387    Kokkos::parallel_for(
 
  388        "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
 
  389        Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
 
  390        KOKKOS_LAMBDA(
const LO i) {
 
  402    if (!Pimport.is_null() && !Iimport.is_null()) {
 
  403      Cimport = Pimport->setUnion(*Iimport);
 
  404    } 
else if (!Pimport.is_null() && Iimport.is_null()) {
 
  405      Cimport = Pimport->setUnion();
 
  406    } 
else if (Pimport.is_null() && !Iimport.is_null()) {
 
  407      Cimport = Iimport->setUnion();
 
  409      throw std::runtime_error(
"TpetraExt::RAP status of matrix importers is nonsensical");
 
  411    Ccolmap = Cimport->getTargetMap();
 
  416    TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
 
  417                               std::runtime_error, 
"Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
 
  424    Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
 
  425    local_map_type Ccolmap_local = Ccolmap->getLocalMap();
 
  426    Kokkos::parallel_for(
 
  427        "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement", range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
 
  428          Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
 
  430    Kokkos::parallel_for(
 
  431        "Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement", range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
 
  432          Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
 
  440  Ac.replaceColMap(Ccolmap);
 
  458  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
 
  459  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"), Aview.colMap->getLocalNumElements());
 
  460  Kokkos::parallel_for(
 
  461      "Tpetra::mult_R_A_P_newmatrix::construct_tables", range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(
const LO i) {
 
  462        GO aidx  = Acolmap_local.getGlobalElement(i);
 
  463        LO P_LID = Prowmap_local.getLocalElement(aidx);
 
  464        if (P_LID != LO_INVALID) {
 
  465          targetMapToOrigRow(i)   = P_LID;
 
  466          targetMapToImportRow(i) = LO_INVALID;
 
  468          LO I_LID                = Irowmap_local.getLocalElement(aidx);
 
  469          targetMapToOrigRow(i)   = LO_INVALID;
 
  470          targetMapToImportRow(i) = I_LID;
 
  476  KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
 
  477      mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
 
  478                                          targetMapToOrigRow, targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
 
  479                                          Ac, Cimport, label, params);
 
  483template <
class Scalar,
 
  487void mult_R_A_P_reuse(
 
  488    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
 
  489    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
 
  490    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
 
  491    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
 
  492    const std::string& label,
 
  493    const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  494  using Teuchos::Array;
 
  495  using Teuchos::ArrayRCP;
 
  496  using Teuchos::ArrayView;
 
  501  typedef LocalOrdinal LO;
 
  502  typedef GlobalOrdinal GO;
 
  505  typedef Import<LO, GO, NO> import_type;
 
  506  typedef Map<LO, GO, NO> map_type;
 
  509  typedef typename map_type::local_map_type local_map_type;
 
  511  typedef typename KCRS::StaticCrsGraphType graph_t;
 
  512  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
 
  513  typedef typename NO::execution_space execution_space;
 
  514  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 
  515  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
 
  517#ifdef HAVE_TPETRA_MMM_TIMINGS 
  518  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  519  using Teuchos::TimeMonitor;
 
  520  RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
 
  522  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 
  525  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
 
  526  RCP<const map_type> Ccolmap    = Ac.getColMap();
 
  527  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
 
  528  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
 
  529  local_map_type Acolmap_local   = Aview.colMap->getLocalMap();
 
  530  local_map_type Prowmap_local   = Pview.origMatrix->getRowMap()->getLocalMap();
 
  531  local_map_type Irowmap_local;
 
  532  if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
 
  533  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
 
  534  local_map_type Icolmap_local;
 
  535  if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
 
  536  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
 
  539  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
 
  543    Kokkos::parallel_for(
 
  544        range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
 
  545          Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
 
  548    if (!Pview.importMatrix.is_null()) {
 
  549      TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
 
  550                                 std::runtime_error, 
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
 
  552      Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
 
  553      Kokkos::parallel_for(
 
  554          range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
 
  555            Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
 
  561  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
 
  562  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"), Aview.colMap->getLocalNumElements());
 
  563  Kokkos::parallel_for(
 
  564      range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(
const LO i) {
 
  565        GO aidx  = Acolmap_local.getGlobalElement(i);
 
  566        LO B_LID = Prowmap_local.getLocalElement(aidx);
 
  567        if (B_LID != LO_INVALID) {
 
  568          targetMapToOrigRow(i)   = B_LID;
 
  569          targetMapToImportRow(i) = LO_INVALID;
 
  571          LO I_LID                = Irowmap_local.getLocalElement(aidx);
 
  572          targetMapToOrigRow(i)   = LO_INVALID;
 
  573          targetMapToImportRow(i) = I_LID;
 
  579  KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
 
  580      mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
 
  581                                      targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
 
  582                                      Ac, Cimport, label, params);
 
  586template <
class Scalar,
 
  590void mult_PT_A_P_newmatrix(
 
  591    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
 
  592    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
 
  593    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
 
  594    const std::string& label,
 
  595    const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  596  using Teuchos::Array;
 
  597  using Teuchos::ArrayRCP;
 
  598  using Teuchos::ArrayView;
 
  603  typedef LocalOrdinal LO;
 
  604  typedef GlobalOrdinal GO;
 
  607  typedef Import<LO, GO, NO> import_type;
 
  608  typedef Map<LO, GO, NO> map_type;
 
  611  typedef typename map_type::local_map_type local_map_type;
 
  613  typedef typename KCRS::StaticCrsGraphType graph_t;
 
  614  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
 
  615  typedef typename NO::execution_space execution_space;
 
  616  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 
  617  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
 
  619#ifdef HAVE_TPETRA_MMM_TIMINGS 
  620  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  621  using Teuchos::TimeMonitor;
 
  622  RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP M5 Cmap")))));
 
  624  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 
  627  RCP<const import_type> Cimport;
 
  628  RCP<const map_type> Ccolmap;
 
  629  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
 
  630  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
 
  631  local_map_type Acolmap_local   = Aview.colMap->getLocalMap();
 
  632  local_map_type Prowmap_local   = Pview.origMatrix->getRowMap()->getLocalMap();
 
  633  local_map_type Irowmap_local;
 
  634  if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
 
  635  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
 
  636  local_map_type Icolmap_local;
 
  637  if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
 
  644  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Pcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
 
  646  if (Pview.importMatrix.is_null()) {
 
  649    Ccolmap             = Pview.colMap;
 
  650    const LO colMapSize = 
static_cast<LO
>(Pview.colMap->getLocalNumElements());
 
  652    Kokkos::parallel_for(
 
  653        "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
 
  654        Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
 
  655        KOKKOS_LAMBDA(
const LO i) {
 
  667    if (!Pimport.is_null() && !Iimport.is_null()) {
 
  668      Cimport = Pimport->setUnion(*Iimport);
 
  669    } 
else if (!Pimport.is_null() && Iimport.is_null()) {
 
  670      Cimport = Pimport->setUnion();
 
  671    } 
else if (Pimport.is_null() && !Iimport.is_null()) {
 
  672      Cimport = Iimport->setUnion();
 
  674      throw std::runtime_error(
"TpetraExt::RAP status of matrix importers is nonsensical");
 
  676    Ccolmap = Cimport->getTargetMap();
 
  681    TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
 
  682                               std::runtime_error, 
"Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
 
  689    Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
 
  690    local_map_type Ccolmap_local = Ccolmap->getLocalMap();
 
  691    Kokkos::parallel_for(
 
  692        "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement", range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
 
  693          Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
 
  695    Kokkos::parallel_for(
 
  696        "Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement", range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
 
  697          Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
 
  705  Ac.replaceColMap(Ccolmap);
 
  723  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
 
  724  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"), Aview.colMap->getLocalNumElements());
 
  726  Kokkos::parallel_for(
 
  727      "Tpetra::mult_R_A_P_newmatrix::construct_tables", range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(
const LO i) {
 
  728        GO aidx  = Acolmap_local.getGlobalElement(i);
 
  729        LO P_LID = Prowmap_local.getLocalElement(aidx);
 
  730        if (P_LID != LO_INVALID) {
 
  731          targetMapToOrigRow(i)   = P_LID;
 
  732          targetMapToImportRow(i) = LO_INVALID;
 
  734          LO I_LID                = Irowmap_local.getLocalElement(aidx);
 
  735          targetMapToOrigRow(i)   = LO_INVALID;
 
  736          targetMapToImportRow(i) = I_LID;
 
  742  KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
 
  743      mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
 
  744                                           targetMapToOrigRow, targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
 
  745                                           Ac, Cimport, label, params);
 
  749template <
class Scalar,
 
  753void mult_PT_A_P_reuse(
 
  754    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
 
  755    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
 
  756    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
 
  757    const std::string& label,
 
  758    const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  759  using Teuchos::Array;
 
  760  using Teuchos::ArrayRCP;
 
  761  using Teuchos::ArrayView;
 
  766  typedef LocalOrdinal LO;
 
  767  typedef GlobalOrdinal GO;
 
  770  typedef Import<LO, GO, NO> import_type;
 
  771  typedef Map<LO, GO, NO> map_type;
 
  774  typedef typename map_type::local_map_type local_map_type;
 
  776  typedef typename KCRS::StaticCrsGraphType graph_t;
 
  777  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
 
  778  typedef typename NO::execution_space execution_space;
 
  779  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 
  780  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
 
  782#ifdef HAVE_TPETRA_MMM_TIMINGS 
  783  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  784  using Teuchos::TimeMonitor;
 
  785  RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
 
  787  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 
  790  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
 
  791  RCP<const map_type> Ccolmap    = Ac.getColMap();
 
  792  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
 
  793  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
 
  794  local_map_type Acolmap_local   = Aview.colMap->getLocalMap();
 
  795  local_map_type Prowmap_local   = Pview.origMatrix->getRowMap()->getLocalMap();
 
  796  local_map_type Irowmap_local;
 
  797  if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
 
  798  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
 
  799  local_map_type Icolmap_local;
 
  800  if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
 
  801  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
 
  804  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
 
  808    Kokkos::parallel_for(
 
  809        range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
 
  810          Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
 
  813    if (!Pview.importMatrix.is_null()) {
 
  814      TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
 
  815                                 std::runtime_error, 
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
 
  817      Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
 
  818      Kokkos::parallel_for(
 
  819          range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
 
  820            Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
 
  826  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
 
  827  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"), Aview.colMap->getLocalNumElements());
 
  828  Kokkos::parallel_for(
 
  829      range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(
const LO i) {
 
  830        GO aidx  = Acolmap_local.getGlobalElement(i);
 
  831        LO B_LID = Prowmap_local.getLocalElement(aidx);
 
  832        if (B_LID != LO_INVALID) {
 
  833          targetMapToOrigRow(i)   = B_LID;
 
  834          targetMapToImportRow(i) = LO_INVALID;
 
  836          LO I_LID                = Irowmap_local.getLocalElement(aidx);
 
  837          targetMapToOrigRow(i)   = LO_INVALID;
 
  838          targetMapToImportRow(i) = I_LID;
 
  844  KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
 
  845      mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
 
  846                                       targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
 
  847                                       Ac, Cimport, label, params);
 
  853template <
class Scalar,
 
  857          class LocalOrdinalViewType>
 
  858void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
 
  859                                                                                                                           CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
 
  860                                                                                                                           CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
 
  861                                                                                                                           const LocalOrdinalViewType& Acol2Prow_dev,
 
  862                                                                                                                           const LocalOrdinalViewType& Acol2PIrow_dev,
 
  863                                                                                                                           const LocalOrdinalViewType& Pcol2Accol_dev,
 
  864                                                                                                                           const LocalOrdinalViewType& PIcol2Accol_dev,
 
  865                                                                                                                           CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
 
  866                                                                                                                           Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
 
  867                                                                                                                           const std::string& label,
 
  868                                                                                                                           const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  869#ifdef HAVE_TPETRA_MMM_TIMINGS 
  870  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  871  using Teuchos::TimeMonitor;
 
  872  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix SerialCore"))));
 
  875  using Teuchos::Array;
 
  876  using Teuchos::ArrayRCP;
 
  877  using Teuchos::ArrayView;
 
  883  typedef typename KCRS::StaticCrsGraphType graph_t;
 
  884  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
 
  885  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
 
  886  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
 
  887  typedef typename KCRS::values_type::non_const_type scalar_view_t;
 
  890  typedef LocalOrdinal LO;
 
  891  typedef GlobalOrdinal GO;
 
  893  typedef Map<LO, GO, NO> map_type;
 
  894  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 
  895  const LO LO_INVALID     = Teuchos::OrdinalTraits<LO>::invalid();
 
  896  const SC SC_ZERO        = Teuchos::ScalarTraits<Scalar>::zero();
 
  899  RCP<const map_type> Accolmap = Ac.getColMap();
 
  900  size_t m                     = Rview.origMatrix->getLocalNumRows();
 
  901  size_t n                     = Accolmap->getLocalNumElements();
 
  902  size_t p_max_nnz_per_row     = Pview.origMatrix->getLocalMaxNumRowEntries();
 
  905  auto Acol2Prow   = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
 
  907  auto Acol2PIrow  = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
 
  909  auto Pcol2Accol  = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
 
  911  auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
 
  915  const auto Amat = Aview.origMatrix->getLocalMatrixHost();
 
  916  const auto Pmat = Pview.origMatrix->getLocalMatrixHost();
 
  917  const auto Rmat = Rview.origMatrix->getLocalMatrixHost();
 
  919  auto Arowptr       = Amat.graph.row_map;
 
  920  auto Prowptr       = Pmat.graph.row_map;
 
  921  auto Rrowptr       = Rmat.graph.row_map;
 
  922  const auto Acolind = Amat.graph.entries;
 
  923  const auto Pcolind = Pmat.graph.entries;
 
  924  const auto Rcolind = Rmat.graph.entries;
 
  925  const auto Avals   = Amat.values;
 
  926  const auto Pvals   = Pmat.values;
 
  927  const auto Rvals   = Rmat.values;
 
  929  typename c_lno_view_t::host_mirror_type::const_type Irowptr;
 
  930  typename lno_nnz_view_t::host_mirror_type Icolind;
 
  931  typename scalar_view_t::host_mirror_type Ivals;
 
  932  if (!Pview.importMatrix.is_null()) {
 
  933    auto lclP         = Pview.importMatrix->getLocalMatrixHost();
 
  934    Irowptr           = lclP.graph.row_map;
 
  935    Icolind           = lclP.graph.entries;
 
  937    p_max_nnz_per_row = std::max(p_max_nnz_per_row, Pview.importMatrix->getLocalMaxNumRowEntries());
 
  940#ifdef HAVE_TPETRA_MMM_TIMINGS 
  941  RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix SerialCore - Compare"))));
 
  951  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
 
  952  typename lno_view_t::host_mirror_type Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"), m + 1);
 
  953  typename lno_nnz_view_t::host_mirror_type Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"), CSR_alloc);
 
  954  typename scalar_view_t::host_mirror_type Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"), CSR_alloc);
 
  964  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
 
  965  Array<size_t> ac_status(n, ST_INVALID);
 
  975  size_t nnz = 0, nnz_old = 0;
 
  976  for (
size_t i = 0; i < m; i++) {
 
  982    for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i + 1]; kk++) {
 
  984      const SC Rik = Rvals[kk];    
 
  988      for (
size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
 
  990        const SC Akl = Avals[ll];    
 
  994        if (Acol2Prow[l] != LO_INVALID) {
 
 1001          size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
 
 1004          for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
 
 1006            LO Acj = Pcol2Accol[j];
 
 1009            if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
 
 1010#ifdef HAVE_TPETRA_DEBUG 
 1012              TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
 
 1014                                         label << 
" ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
 
 1017              ac_status[Acj] = nnz;
 
 1019              Cvals[nnz]     = Rik * Akl * Plj;
 
 1022              Cvals[ac_status[Acj]] += Rik * Akl * Plj;
 
 1032          size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
 
 1033          for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
 
 1035            LO Acj = PIcol2Accol[j];
 
 1038            if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
 
 1039#ifdef HAVE_TPETRA_DEBUG 
 1041              TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
 
 1043                                         label << 
" ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
 
 1046              ac_status[Acj] = nnz;
 
 1048              Cvals[nnz]     = Rik * Akl * Plj;
 
 1051              Cvals[ac_status[Acj]] += Rik * Akl * Plj;
 
 1058    if (nnz + n > CSR_alloc) {
 
 1060      Kokkos::resize(Ccolind, CSR_alloc);
 
 1061      Kokkos::resize(Cvals, CSR_alloc);
 
 1069  Kokkos::resize(Ccolind, nnz);
 
 1070  Kokkos::resize(Cvals, nnz);
 
 1072#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1074  MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
 
 1076  auto Crowptr_dev = Kokkos::create_mirror_view_and_copy(
 
 1077      typename KCRS::device_type(), Crowptr);
 
 1078  auto Ccolind_dev = Kokkos::create_mirror_view_and_copy(
 
 1079      typename KCRS::device_type(), Ccolind);
 
 1080  auto Cvals_dev = Kokkos::create_mirror_view_and_copy(
 
 1081      typename KCRS::device_type(), Cvals);
 
 1084  if (params.is_null() || params->get(
"sort entries", 
true))
 
 1085    Import_Util::sortCrsEntries(Crowptr_dev, Ccolind_dev, Cvals_dev);
 
 1086  Ac.setAllValues(Crowptr_dev, Ccolind_dev, Cvals_dev);
 
 1088#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1090  MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
 
 1101  RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
 
 1102  labelList->set(
"Timer Label", label);
 
 1103  if (!params.is_null()) labelList->set(
"compute global constants", params->get(
"compute global constants", 
true));
 
 1104  RCP<const Export<LO, GO, NO> > dummyExport;
 
 1105  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
 
 1106                              Rview.origMatrix->getRangeMap(),
 
 1115template <
class Scalar,
 
 1117          class GlobalOrdinal,
 
 1119          class LocalOrdinalViewType>
 
 1120void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
 
 1121                                                                                                                       CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
 
 1122                                                                                                                       CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
 
 1123                                                                                                                       const LocalOrdinalViewType& Acol2Prow_dev,
 
 1124                                                                                                                       const LocalOrdinalViewType& Acol2PIrow_dev,
 
 1125                                                                                                                       const LocalOrdinalViewType& Pcol2Accol_dev,
 
 1126                                                                                                                       const LocalOrdinalViewType& PIcol2Accol_dev,
 
 1127                                                                                                                       CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
 
 1128                                                                                                                       Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
 
 1129                                                                                                                       const std::string& label,
 
 1130                                                                                                                       const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
 1131#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1132  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
 1133  using Teuchos::TimeMonitor;
 
 1134  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse SerialCore"))));
 
 1137  using Teuchos::Array;
 
 1138  using Teuchos::ArrayRCP;
 
 1139  using Teuchos::ArrayView;
 
 1144  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
 
 1145  typedef typename KCRS::StaticCrsGraphType graph_t;
 
 1146  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
 
 1147  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
 
 1148  typedef typename KCRS::values_type::non_const_type scalar_view_t;
 
 1151  typedef LocalOrdinal LO;
 
 1152  typedef GlobalOrdinal GO;
 
 1154  typedef Map<LO, GO, NO> map_type;
 
 1155  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 
 1156  const LO LO_INVALID     = Teuchos::OrdinalTraits<LO>::invalid();
 
 1157  const SC SC_ZERO        = Teuchos::ScalarTraits<Scalar>::zero();
 
 1160  RCP<const map_type> Accolmap = Ac.getColMap();
 
 1161  size_t m                     = Rview.origMatrix->getLocalNumRows();
 
 1162  size_t n                     = Accolmap->getLocalNumElements();
 
 1163  size_t p_max_nnz_per_row     = Pview.origMatrix->getLocalMaxNumRowEntries();
 
 1166  auto Acol2Prow   = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
 
 1168  auto Acol2PIrow  = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
 
 1170  auto Pcol2Accol  = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
 
 1172  auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
 
 1176  const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
 
 1177  const KCRS& Pmat = Pview.origMatrix->getLocalMatrixHost();
 
 1178  const KCRS& Rmat = Rview.origMatrix->getLocalMatrixHost();
 
 1179  const KCRS& Cmat = Ac.getLocalMatrixHost();
 
 1181  c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
 
 1182  const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
 
 1183  const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
 
 1184  scalar_view_t Cvals = Cmat.values;
 
 1186  c_lno_view_t Irowptr;
 
 1187  lno_nnz_view_t Icolind;
 
 1188  scalar_view_t Ivals;
 
 1189  if (!Pview.importMatrix.is_null()) {
 
 1190    auto lclP         = Pview.importMatrix->getLocalMatrixHost();
 
 1191    Irowptr           = lclP.graph.row_map;
 
 1192    Icolind           = lclP.graph.entries;
 
 1193    Ivals             = lclP.values;
 
 1194    p_max_nnz_per_row = std::max(p_max_nnz_per_row, Pview.importMatrix->getLocalMaxNumRowEntries());
 
 1197#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1198  RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse SerialCore - Compare"))));
 
 1209  Array<size_t> ac_status(n, ST_INVALID);
 
 1223  size_t OLD_ip = 0, CSR_ip = 0;
 
 1224  for (
size_t i = 0; i < m; i++) {
 
 1227    OLD_ip = Crowptr[i];
 
 1228    CSR_ip = Crowptr[i + 1];
 
 1229    for (
size_t k = OLD_ip; k < CSR_ip; k++) {
 
 1230      ac_status[Ccolind[k]] = k;
 
 1237    for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i + 1]; kk++) {
 
 1239      const SC Rik = Rvals[kk];    
 
 1243      for (
size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
 
 1245        const SC Akl = Avals[ll];    
 
 1249        if (Acol2Prow[l] != LO_INVALID) {
 
 1256          size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
 
 1259          for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
 
 1261            LO Cij = Pcol2Accol[j];
 
 1264            TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
 
 1265                                       std::runtime_error, 
"Trying to insert a new entry (" << i << 
"," << Cij << 
") into a static graph " 
 1266                                                                                            << 
"(c_status = " << ac_status[Cij] << 
" of [" << OLD_ip << 
"," << CSR_ip << 
"))");
 
 1268            Cvals[ac_status[Cij]] += Rik * Akl * Plj;
 
 1277          size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
 
 1278          for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
 
 1280            LO Cij = PIcol2Accol[j];
 
 1283            TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
 
 1284                                       std::runtime_error, 
"Trying to insert a new entry (" << i << 
"," << Cij << 
") into a static graph " 
 1285                                                                                            << 
"(c_status = " << ac_status[Cij] << 
" of [" << OLD_ip << 
"," << CSR_ip << 
"))");
 
 1287            Cvals[ac_status[Cij]] += Rik * Akl * Plj;
 
 1294#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1295  auto MM3 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse ESFC"))));
 
 1298  Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());
 
 1304template <
class Scalar,
 
 1306          class GlobalOrdinal,
 
 1308          class LocalOrdinalViewType>
 
 1309void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
 
 1310                                                                                                                            CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
 
 1311                                                                                                                            const LocalOrdinalViewType& Acol2Prow,
 
 1312                                                                                                                            const LocalOrdinalViewType& Acol2PIrow,
 
 1313                                                                                                                            const LocalOrdinalViewType& Pcol2Accol,
 
 1314                                                                                                                            const LocalOrdinalViewType& PIcol2Accol,
 
 1315                                                                                                                            CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
 
 1316                                                                                                                            Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
 
 1317                                                                                                                            const std::string& label,
 
 1318                                                                                                                            const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
 1319#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1320  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
 1321  using Teuchos::TimeMonitor;
 
 1322  Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose")));
 
 1326  typedef RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer_type;
 
 1327  transposer_type transposer(Pview.origMatrix, label + std::string(
"XP: "));
 
 1329  using Teuchos::ParameterList;
 
 1331  RCP<ParameterList> transposeParams(
new ParameterList);
 
 1332  transposeParams->set(
"sort", 
false);
 
 1334  if (!params.is_null()) {
 
 1335    transposeParams->set(
"compute global constants",
 
 1336                         params->get(
"compute global constants: temporaries",
 
 1339  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
 
 1340      transposer.createTransposeLocal(transposeParams);
 
 1341  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
 
 1342  Rview.origMatrix = Ptrans;
 
 1344  mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview, Acol2Prow, Acol2PIrow, Pcol2Accol, PIcol2Accol, Ac, Acimport, label, params);
 
 1350template <
class Scalar,
 
 1352          class GlobalOrdinal,
 
 1354          class LocalOrdinalViewType>
 
 1355void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
 
 1356                                                                                                                        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
 
 1357                                                                                                                        const LocalOrdinalViewType& Acol2Prow,
 
 1358                                                                                                                        const LocalOrdinalViewType& Acol2PIrow,
 
 1359                                                                                                                        const LocalOrdinalViewType& Pcol2Accol,
 
 1360                                                                                                                        const LocalOrdinalViewType& PIcol2Accol,
 
 1361                                                                                                                        CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
 
 1362                                                                                                                        Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
 
 1363                                                                                                                        const std::string& label,
 
 1364                                                                                                                        const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
 1365#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1366  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
 1367  using Teuchos::TimeMonitor;
 
 1368  Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose")));
 
 1372  typedef RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer_type;
 
 1373  transposer_type transposer(Pview.origMatrix, label + std::string(
"XP: "));
 
 1375  using Teuchos::ParameterList;
 
 1377  RCP<ParameterList> transposeParams(
new ParameterList);
 
 1378  transposeParams->set(
"sort", 
false);
 
 1380  if (!params.is_null()) {
 
 1381    transposeParams->set(
"compute global constants",
 
 1382                         params->get(
"compute global constants: temporaries",
 
 1385  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
 
 1386      transposer.createTransposeLocal(transposeParams);
 
 1387  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
 
 1388  Rview.origMatrix = Ptrans;
 
 1390  mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview, Acol2Prow, Acol2PIrow, Pcol2Accol, PIcol2Accol, Ac, Acimport, label, params);
 
 1398template <
class Scalar,
 
 1400          class GlobalOrdinal,
 
 1402void KernelWrappers3MMM<Scalar, LocalOrdinal, GlobalOrdinal, Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
 
 1403                                                                                                               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
 
 1404                                                                                                               const Teuchos::Array<LocalOrdinal>& Acol2PRow,
 
 1405                                                                                                               const Teuchos::Array<LocalOrdinal>& Acol2PRowImport,
 
 1406                                                                                                               const Teuchos::Array<LocalOrdinal>& Pcol2Accol,
 
 1407                                                                                                               const Teuchos::Array<LocalOrdinal>& PIcol2Accol,
 
 1408                                                                                                               CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
 
 1409                                                                                                               Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
 
 1410                                                                                                               const std::string& label,
 
 1411                                                                                                               const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
 1412#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1413  std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
 1414  using Teuchos::TimeMonitor;
 
 1415  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix SerialCore"))));
 
 1418  using Teuchos::Array;
 
 1419  using Teuchos::ArrayRCP;
 
 1420  using Teuchos::ArrayView;
 
 1425  typedef LocalOrdinal LO;
 
 1426  typedef GlobalOrdinal GO;
 
 1428  typedef RowMatrixTransposer<SC, LO, GO, NO> transposer_type;
 
 1429  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 
 1430  const SC SC_ZERO    = Teuchos::ScalarTraits<Scalar>::zero();
 
 1435  size_t n    = Ac.getRowMap()->getLocalNumElements();
 
 1436  LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
 
 1439  ArrayRCP<size_t> Acrowptr_RCP;
 
 1440  ArrayRCP<LO> Accolind_RCP;
 
 1441  ArrayRCP<SC> Acvals_RCP;
 
 1448  auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
 
 1449  auto Acolind = Aview.origMatrix->getLocalIndicesHost();
 
 1450  auto Avals   = Aview.origMatrix->getLocalValuesHost(
 
 1451        Tpetra::Access::ReadOnly);
 
 1452  auto Prowptr = Pview.origMatrix->getLocalRowPtrsHost();
 
 1453  auto Pcolind = Pview.origMatrix->getLocalIndicesHost();
 
 1454  auto Pvals   = Pview.origMatrix->getLocalValuesHost(
 
 1455        Tpetra::Access::ReadOnly);
 
 1456  decltype(Prowptr) Irowptr;
 
 1457  decltype(Pcolind) Icolind;
 
 1458  decltype(Pvals) Ivals;
 
 1460  if (!Pview.importMatrix.is_null()) {
 
 1461    Irowptr = Pview.importMatrix->getLocalRowPtrsHost();
 
 1462    Icolind = Pview.importMatrix->getLocalIndicesHost();
 
 1463    Ivals   = Pview.importMatrix->getLocalValuesHost(
 
 1464          Tpetra::Access::ReadOnly);
 
 1472  ArrayView<size_t> Acrowptr;
 
 1473  ArrayView<LO> Accolind;
 
 1474  ArrayView<SC> Acvals;
 
 1485#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1486  MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose"))));
 
 1493  transposer_type transposer(Pview.origMatrix, label + std::string(
"XP: "));
 
 1495  using Teuchos::ParameterList;
 
 1496  RCP<ParameterList> transposeParams(
new ParameterList);
 
 1497  transposeParams->set(
"sort", 
false);
 
 1498  if (!params.is_null()) {
 
 1499    transposeParams->set(
"compute global constants",
 
 1500                         params->get(
"compute global constants: temporaries",
 
 1503  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
 
 1504      transposer.createTransposeLocal(transposeParams);
 
 1506  auto Rrowptr = Ptrans->getLocalRowPtrsHost();
 
 1507  auto Rcolind = Ptrans->getLocalIndicesHost();
 
 1508  auto Rvals   = Ptrans->getLocalValuesHost(Tpetra::Access::ReadOnly);
 
 1513#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1514  MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP graph"))));
 
 1517  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 
 1518  Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
 
 1520  size_t nnz_alloc  = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
 
 1521  size_t nnzPerRowA = 100;
 
 1522  if (Aview.origMatrix->getLocalNumEntries() > 0)
 
 1523    nnzPerRowA = Aview.origMatrix->getLocalNumEntries() / Aview.origMatrix->getLocalNumRows();
 
 1524  Acrowptr_RCP.resize(n + 1);
 
 1525  Acrowptr = Acrowptr_RCP();
 
 1526  Accolind_RCP.resize(nnz_alloc);
 
 1527  Accolind = Accolind_RCP();
 
 1529  size_t nnz = 0, nnz_old = 0;
 
 1530  for (
size_t i = 0; i < n; i++) {
 
 1536    for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i + 1]; kk++) {
 
 1539      for (
size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
 
 1542        if (Acol2PRow[l] != LO_INVALID) {
 
 1549          size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
 
 1552          for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
 
 1554            LO Acj = Pcol2Accol[j];
 
 1556            if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
 
 1558              ac_status[Acj] = nnz;
 
 1559              Accolind[nnz]  = Acj;
 
 1570          size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
 
 1571          for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
 
 1573            LO Acj = PIcol2Accol[j];
 
 1575            if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
 
 1577              ac_status[Acj] = nnz;
 
 1578              Accolind[nnz]  = Acj;
 
 1588    if (nnz + std::max(5 * nnzPerRowA, n) > nnz_alloc) {
 
 1590      nnz_alloc = std::max(nnz_alloc, nnz + std::max(5 * nnzPerRowA, n));
 
 1591      Accolind_RCP.resize(nnz_alloc);
 
 1592      Accolind = Accolind_RCP();
 
 1593      Acvals_RCP.resize(nnz_alloc);
 
 1594      Acvals = Acvals_RCP();
 
 1601  Accolind_RCP.resize(nnz);
 
 1602  Accolind = Accolind_RCP();
 
 1605  Acvals_RCP.resize(nnz, SC_ZERO);
 
 1606  Acvals = Acvals_RCP();
 
 1612#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1613  MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Fill Matrix"))));
 
 1616  for (
size_t k = 0; k < n; k++) {
 
 1617    for (
size_t ii = Prowptr[k]; ii < Prowptr[k + 1]; ii++) {
 
 1619      const SC Pki = Pvals[ii];
 
 1620      for (
size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
 
 1622        const SC Akl = Avals[ll];
 
 1625        if (Acol2PRow[l] != LO_INVALID) {
 
 1632          size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
 
 1633          for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
 
 1635            LO Acj = Pcol2Accol[j];
 
 1637            for (pp = Acrowptr[i]; pp < Acrowptr[i + 1]; pp++)
 
 1638              if (Accolind[pp] == Acj)
 
 1642            Acvals[pp] += Pki * Akl * Pvals[jj];
 
 1651          size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
 
 1652          for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
 
 1654            LO Acj = PIcol2Accol[j];
 
 1656            for (pp = Acrowptr[i]; pp < Acrowptr[i + 1]; pp++)
 
 1657              if (Accolind[pp] == Acj)
 
 1661            Acvals[pp] += Pki * Akl * Ivals[jj];
 
 1668#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1669  MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP sort"))));
 
 1676  Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
 
 1679  Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
 
 1681#ifdef HAVE_TPETRA_MMM_TIMINGS 
 1682  MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix ESFC"))));
 
 1693  RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
 
 1694  labelList->set(
"Timer Label", label);
 
 1696  if (!params.is_null()) labelList->set(
"compute global constants", params->get(
"compute global constants", 
true));
 
 1697  RCP<const Export<LO, GO, NO> > dummyExport;
 
 1698  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
 
 1699                              Pview.origMatrix->getDomainMap(),
 
 1701                              dummyExport, labelList);
 
 1713#define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR, LO, GO, NODE) \ 
 1715  template void TripleMatrixMultiply::MultiplyRAP(                \ 
 1716      const CrsMatrix<SCALAR, LO, GO, NODE>& R,                   \ 
 1718      const CrsMatrix<SCALAR, LO, GO, NODE>& A,                   \ 
 1720      const CrsMatrix<SCALAR, LO, GO, NODE>& P,                   \ 
 1722      CrsMatrix<SCALAR, LO, GO, NODE>& Ac,                        \ 
 1723      bool call_FillComplete_on_result,                           \ 
 1724      const std::string& label,                                   \ 
 1725      const Teuchos::RCP<Teuchos::ParameterList>& params); 
Utility functions for packing and unpacking sparse matrix entries.
 
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
 
Stand-alone utility functions and macros.
 
Struct that holds views of the contents of a CrsMatrix.
 
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
 
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
 
void MultiplyRAP(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &R, bool transposeR, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &P, bool transposeP, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Ac, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Sparse matrix-matrix multiply.
 
Namespace Tpetra contains the class and methods constituting the Tpetra library.