10#ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
11#define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
12#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
13#include "Tpetra_RowMatrixTransposer.hpp"
17namespace MatrixMatrix {
19namespace ExtraKernels {
22template <
class CrsMatrixType>
23size_t C_estimate_nnz_per_row(CrsMatrixType& A, CrsMatrixType& B) {
25 size_t Aest = 100, Best = 100;
26 if (A.getLocalNumEntries() > 0)
27 Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 100;
28 if (B.getLocalNumEntries() > 0)
29 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries() / B.getLocalNumRows() : 100;
31 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
32 nnzperrow *= nnzperrow;
38template <
class CrsMatrixType>
39size_t Ac_estimate_nnz(CrsMatrixType& A, CrsMatrixType& P) {
40 size_t nnzPerRowA = 100, Pcols = 100;
41 if (A.getLocalNumEntries() > 0)
42 nnzPerRowA = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 9;
43 if (P.getLocalNumEntries() > 0)
44 Pcols = (P.getLocalNumCols() > 0) ? P.getLocalNumCols() : 100;
45 return (
size_t)(Pcols * nnzPerRowA + 5 * nnzPerRowA + 300);
48#if defined(HAVE_TPETRA_INST_OPENMP)
50template <
class Scalar,
53 class LocalOrdinalViewType>
54void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
55 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
56 const LocalOrdinalViewType& targetMapToOrigRow,
57 const LocalOrdinalViewType& targetMapToImportRow,
58 const LocalOrdinalViewType& Bcol2Ccol,
59 const LocalOrdinalViewType& Icol2Ccol,
60 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
61 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
62 const std::string& label,
63 const Teuchos::RCP<Teuchos::ParameterList>& params) {
64#ifdef HAVE_TPETRA_MMM_TIMINGS
65 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
66 using Teuchos::TimeMonitor;
67 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix LTGCore"))));
70 using Teuchos::ArrayRCP;
71 using Teuchos::ArrayView;
77 typedef typename KCRS::device_type device_t;
78 typedef typename KCRS::StaticCrsGraphType graph_t;
79 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
80 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
81 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
82 typedef typename KCRS::values_type::non_const_type scalar_view_t;
86 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
87 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
90 typedef LocalOrdinal LO;
91 typedef GlobalOrdinal GO;
92 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
93 typedef Map<LO, GO, NO> map_type;
98 typedef NO::execution_space execution_space;
99 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
102 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
103 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
104 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
106 bool skipExplicitZero =
true;
107 if (params && params->isParameter(
"MM Skip Explicit Zeros")) {
108 skipExplicitZero = params->get<
bool>(
"MM Skip Explicit Zeros");
112 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
113 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
115 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
116 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
117 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
118 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
120 c_lno_view_t Irowptr;
121 lno_nnz_view_t Icolind;
123 if (!Bview.importMatrix.is_null()) {
124 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
125 Irowptr = lclB.graph.row_map;
126 Icolind = lclB.graph.entries;
128 b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
132 RCP<const map_type> Ccolmap = C.getColMap();
133 size_t m = Aview.origMatrix->getLocalNumRows();
134 size_t n = Ccolmap->getLocalNumElements();
135 size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
138 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
139 if (!params.is_null()) {
140 if (params->isParameter(
"openmp: ltg thread max"))
141 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
145 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
147 lno_nnz_view_t entriesC;
148 scalar_view_t valuesC;
152 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz", thread_max + 1);
155 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
156 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
158 double thread_chunk = (double)(m) / thread_max;
161 Kokkos::parallel_for(
"MMM::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
163 size_t my_thread_start = tid * thread_chunk;
164 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
165 size_t my_thread_m = my_thread_stop - my_thread_start;
168 size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
171 std::vector<size_t> c_status(n, INVALID);
173 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
174 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
177 size_t CSR_ip = 0, OLD_ip = 0;
178 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
182 row_mapC(i) = CSR_ip;
185 for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
187 const SC Aval = Avals(k);
188 if (Aval == SC_ZERO && skipExplicitZero)
191 if (targetMapToOrigRow(Aik) != LO_INVALID) {
198 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
201 for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
203 LO Cij = Bcol2Ccol(Bkj);
205 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
207 c_status[Cij] = CSR_ip;
208 Ccolind(CSR_ip) = Cij;
209 Cvals(CSR_ip) = Aval * Bvals(j);
213 Cvals(c_status[Cij]) += Aval * Bvals(j);
224 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
225 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
227 LO Cij = Icol2Ccol(Ikj);
229 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
231 c_status[Cij] = CSR_ip;
232 Ccolind(CSR_ip) = Cij;
233 Cvals(CSR_ip) = Aval * Ivals(j);
237 Cvals(c_status[Cij]) += Aval * Ivals(j);
244 if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1)) * b_max_nnz_per_row) > CSR_alloc) {
246 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
247 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
251 thread_total_nnz(tid) = CSR_ip;
252 tl_colind(tid) = Ccolind;
253 tl_values(tid) = Cvals;
257 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
259#ifdef HAVE_TPETRA_MMM_TIMINGS
261 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix OpenMPSort"))));
264 if (params.is_null() || params->get(
"sort entries",
true)) {
266 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
268 C.setAllValues(row_mapC, entriesC, valuesC);
272template <
class Scalar,
275 class LocalOrdinalViewType>
276void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
277 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
278 const LocalOrdinalViewType& targetMapToOrigRow,
279 const LocalOrdinalViewType& targetMapToImportRow,
280 const LocalOrdinalViewType& Bcol2Ccol,
281 const LocalOrdinalViewType& Icol2Ccol,
282 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
283 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
284 const std::string& label,
285 const Teuchos::RCP<Teuchos::ParameterList>& params) {
288 bool throwOnInsert =
true;
289 if (!params.is_null() && params->isType<
bool>(
"MM Throw For Non-Existent Entries"))
290 throwOnInsert = params->get<
bool>(
"MM Throw For Non-Existent Entries");
292#ifdef HAVE_TPETRA_MMM_TIMINGS
293 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
294 using Teuchos::TimeMonitor;
295 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse LTGCore"))));
298 using Teuchos::Array;
299 using Teuchos::ArrayRCP;
300 using Teuchos::ArrayView;
307 typedef typename KCRS::StaticCrsGraphType graph_t;
308 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
309 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
310 typedef typename KCRS::values_type::non_const_type scalar_view_t;
313 typedef LocalOrdinal LO;
314 typedef GlobalOrdinal GO;
315 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
316 typedef Map<LO, GO, NO> map_type;
321 typedef NO::execution_space execution_space;
322 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
325 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
326 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
327 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
330 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
331 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
332 const KCRS Cmat = C.getLocalMatrixDevice();
334 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
335 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
336 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
337 scalar_view_t Cvals = Cmat.values;
339 c_lno_view_t Irowptr;
340 c_lno_nnz_view_t Icolind;
342 if (!Bview.importMatrix.is_null()) {
343 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
344 Irowptr = lclB.graph.row_map;
345 Icolind = lclB.graph.entries;
350 RCP<const map_type> Ccolmap = C.getColMap();
351 size_t m = Aview.origMatrix->getLocalNumRows();
352 size_t n = Ccolmap->getLocalNumElements();
355 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
356 if (!params.is_null()) {
357 if (params->isParameter(
"openmp: ltg thread max"))
358 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
361 double thread_chunk = (double)(m) / thread_max;
364 Kokkos::parallel_for(
"MMM::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
366 size_t my_thread_start = tid * thread_chunk;
367 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
370 std::vector<size_t> c_status(n, INVALID);
373 size_t CSR_ip = 0, OLD_ip = 0;
374 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
378 CSR_ip = Crowptr(i + 1);
379 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
380 c_status[Ccolind(k)] = k;
385 for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
387 const SC Aval = Avals(k);
391 if (targetMapToOrigRow(Aik) != LO_INVALID) {
393 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
395 for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
397 LO Cij = Bcol2Ccol(Bkj);
399 const bool badInsert = c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip;
401 Cvals(c_status[Cij]) += Aval * Bvals(j);
402 else if (throwOnInsert)
403 TEUCHOS_TEST_FOR_EXCEPTION(badInsert,
404 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
405 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
409 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
410 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
412 LO Cij = Icol2Ccol(Ikj);
414 const bool badInsert = c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip;
416 Cvals(c_status[Cij]) += Aval * Ivals(j);
417 else if (throwOnInsert)
418 TEUCHOS_TEST_FOR_EXCEPTION(badInsert,
419 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
420 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
431template <
class Scalar,
434 class LocalOrdinalViewType>
435void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(
typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
436 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
437 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
438 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
439 const LocalOrdinalViewType& targetMapToOrigRow,
440 const LocalOrdinalViewType& targetMapToImportRow,
441 const LocalOrdinalViewType& Bcol2Ccol,
442 const LocalOrdinalViewType& Icol2Ccol,
443 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
444 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
445 const std::string& label,
446 const Teuchos::RCP<Teuchos::ParameterList>& params) {
447#ifdef HAVE_TPETRA_MMM_TIMINGS
448 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
449 using Teuchos::TimeMonitor;
450 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix LTGCore"))));
453 using Teuchos::Array;
454 using Teuchos::ArrayRCP;
455 using Teuchos::ArrayView;
460 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
462 typedef typename KCRS::device_type device_t;
463 typedef typename KCRS::StaticCrsGraphType graph_t;
464 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
465 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
466 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
467 typedef typename KCRS::values_type::non_const_type scalar_view_t;
471 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
472 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
475 typedef typename scalar_view_t::memory_space scalar_memory_space;
478 typedef LocalOrdinal LO;
479 typedef GlobalOrdinal GO;
481 typedef Map<LO, GO, NO> map_type;
486 typedef NO::execution_space execution_space;
487 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
490 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
491 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
492 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
495 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
496 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
498 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
499 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
500 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
501 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
503 c_lno_view_t Irowptr;
504 lno_nnz_view_t Icolind;
506 if (!Bview.importMatrix.is_null()) {
507 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
508 Irowptr = lclB.graph.row_map;
509 Icolind = lclB.graph.entries;
511 b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
516 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
519 RCP<const map_type> Ccolmap = C.getColMap();
520 size_t m = Aview.origMatrix->getLocalNumRows();
521 size_t n = Ccolmap->getLocalNumElements();
522 size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
525 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
526 if (!params.is_null()) {
527 if (params->isParameter(
"openmp: ltg thread max"))
528 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
532 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
534 lno_nnz_view_t entriesC;
535 scalar_view_t valuesC;
539 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz", thread_max + 1);
542 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
543 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
545 double thread_chunk = (double)(m) / thread_max;
548 Kokkos::parallel_for(
"Jacobi::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
550 size_t my_thread_start = tid * thread_chunk;
551 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
552 size_t my_thread_m = my_thread_stop - my_thread_start;
555 size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
558 std::vector<size_t> c_status(n, INVALID);
560 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
561 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
564 size_t CSR_ip = 0, OLD_ip = 0;
565 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
570 row_mapC(i) = CSR_ip;
572 SC minusOmegaDval = -omega * Dvals(i, 0);
575 for (
size_t j = Browptr(i); j < Browptr(i + 1); j++) {
576 const SC Bval = Bvals(j);
580 LO Cij = Bcol2Ccol(Bij);
583 c_status[Cij] = CSR_ip;
584 Ccolind(CSR_ip) = Cij;
585 Cvals(CSR_ip) = Bvals[j];
591 for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
593 const SC Aval = Avals(k);
597 if (targetMapToOrigRow(Aik) != LO_INVALID) {
604 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
607 for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
609 LO Cij = Bcol2Ccol(Bkj);
611 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
613 c_status[Cij] = CSR_ip;
614 Ccolind(CSR_ip) = Cij;
615 Cvals(CSR_ip) = minusOmegaDval * Aval * Bvals(j);
619 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
630 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
631 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
633 LO Cij = Icol2Ccol(Ikj);
635 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
637 c_status[Cij] = CSR_ip;
638 Ccolind(CSR_ip) = Cij;
639 Cvals(CSR_ip) = minusOmegaDval * Aval * Ivals(j);
643 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
650 if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1) + 1) * b_max_nnz_per_row) > CSR_alloc) {
652 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
653 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
657 thread_total_nnz(tid) = CSR_ip;
658 tl_colind(tid) = Ccolind;
659 tl_values(tid) = Cvals;
663 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
665#ifdef HAVE_TPETRA_MMM_TIMINGS
667 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix OpenMPSort"))));
670 if (params.is_null() || params->get(
"sort entries",
true)) {
672 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
674 C.setAllValues(row_mapC, entriesC, valuesC);
678template <
class Scalar,
681 class LocalOrdinalViewType>
682void jacobi_A_B_reuse_LowThreadGustavsonKernel(
typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
683 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
684 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
685 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
686 const LocalOrdinalViewType& targetMapToOrigRow,
687 const LocalOrdinalViewType& targetMapToImportRow,
688 const LocalOrdinalViewType& Bcol2Ccol,
689 const LocalOrdinalViewType& Icol2Ccol,
690 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
691 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
692 const std::string& label,
693 const Teuchos::RCP<Teuchos::ParameterList>& params) {
694#ifdef HAVE_TPETRA_MMM_TIMINGS
695 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
696 using Teuchos::TimeMonitor;
697 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse LTGCore"))));
699 using Teuchos::Array;
700 using Teuchos::ArrayRCP;
701 using Teuchos::ArrayView;
706 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
709 typedef typename KCRS::StaticCrsGraphType graph_t;
710 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
711 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
712 typedef typename KCRS::values_type::non_const_type scalar_view_t;
715 typedef typename scalar_view_t::memory_space scalar_memory_space;
718 typedef LocalOrdinal LO;
719 typedef GlobalOrdinal GO;
721 typedef Map<LO, GO, NO> map_type;
726 typedef NO::execution_space execution_space;
727 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
730 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
731 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
732 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
735 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
736 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
737 const KCRS Cmat = C.getLocalMatrixDevice();
739 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
740 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
741 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
742 scalar_view_t Cvals = Cmat.values;
744 c_lno_view_t Irowptr;
745 c_lno_nnz_view_t Icolind;
747 if (!Bview.importMatrix.is_null()) {
748 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
749 Irowptr = lclB.graph.row_map;
750 Icolind = lclB.graph.entries;
756 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
759 RCP<const map_type> Ccolmap = C.getColMap();
760 size_t m = Aview.origMatrix->getLocalNumRows();
761 size_t n = Ccolmap->getLocalNumElements();
764 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
765 if (!params.is_null()) {
766 if (params->isParameter(
"openmp: ltg thread max"))
767 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
770 double thread_chunk = (double)(m) / thread_max;
773 Kokkos::parallel_for(
"Jacobi::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
775 size_t my_thread_start = tid * thread_chunk;
776 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
779 std::vector<size_t> c_status(n, INVALID);
782 size_t CSR_ip = 0, OLD_ip = 0;
783 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
787 CSR_ip = Crowptr(i + 1);
789 SC minusOmegaDval = -omega * Dvals(i, 0);
791 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
792 c_status[Ccolind(k)] = k;
798 for (
size_t j = Browptr(i); j < Browptr(i + 1); j++) {
799 const SC Bval = Bvals(j);
803 LO Cij = Bcol2Ccol(Bij);
806 Cvals(c_status[Cij]) += Bvals(j);
810 for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
812 const SC Aval = Avals(k);
816 if (targetMapToOrigRow(Aik) != LO_INVALID) {
818 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
820 for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
822 LO Cij = Bcol2Ccol(Bkj);
824 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
825 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
826 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
828 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
832 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
833 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
835 LO Cij = Icol2Ccol(Ikj);
837 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
838 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
839 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
841 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
852template <
class InColindArrayType,
class InValsArrayType,
853 class OutRowptrType,
class OutColindType,
class OutValsType>
854void copy_out_from_thread_memory(
const OutColindType& thread_total_nnz,
855 const InColindArrayType& Incolind,
856 const InValsArrayType& Invalues,
858 const double thread_chunk,
859 OutRowptrType& row_mapC,
860 OutColindType& entriesC,
861 OutValsType& valuesC) {
862 typedef OutRowptrType lno_view_t;
863 typedef OutColindType lno_nnz_view_t;
864 typedef OutValsType scalar_view_t;
865 typedef typename lno_view_t::execution_space execution_space;
866 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
869 size_t thread_max = Incolind.size();
870 size_t c_nnz_size = 0;
875 lno_view_t thread_start_nnz(
"thread_nnz", thread_max + 1);
876 Kokkos::parallel_scan(
"LTG::Scan", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t i,
size_t& update,
const bool final) {
877 size_t mynnz = thread_total_nnz(i);
878 if (
final) thread_start_nnz(i) = update;
880 if (
final && i + 1 == thread_max) thread_start_nnz(i + 1) = update;
882 c_nnz_size = thread_start_nnz(thread_max);
885 row_mapC(m) = thread_start_nnz(thread_max);
888 lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
889 entriesC = entriesC_;
890 scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
894 Kokkos::parallel_for(
"LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
895 const size_t my_thread_start = tid * thread_chunk;
896 const size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
897 const size_t nnz_thread_start = thread_start_nnz(tid);
899 const size_t nnz_thread_stop = thread_start_nnz(tid + 1);
910 if (my_thread_start != 0) {
911 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
912 row_mapC(i) += nnz_thread_start;
928 const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
929 for (
size_t i = 0; i < my_num_nnz; ++i) {
930 entriesC(nnz_thread_start + i) = Incolind(tid)(i);
931 valuesC(nnz_thread_start + i) = Invalues(tid)(i);
936 if (Incolind(tid).data()) free(Incolind(tid).data());
937 if (Invalues(tid).data()) free(Invalues(tid).data());
944template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
945void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(
typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
946 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
947 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
948 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
949 const LocalOrdinalViewType& Acol2Brow,
950 const LocalOrdinalViewType& Acol2Irow,
951 const LocalOrdinalViewType& Bcol2Ccol,
952 const LocalOrdinalViewType& Icol2Ccol,
953 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
954 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Cimport,
955 const std::string& label,
956 const Teuchos::RCP<Teuchos::ParameterList>& params) {
957#ifdef HAVE_TPETRA_MMM_TIMINGS
958 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
959 using Teuchos::TimeMonitor;
960 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK"))));
961 Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Multiply"))));
964 typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
969 Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(
new Matrix_t(C.getRowMap(), 0));
970 Tpetra::MMdetails::mult_A_B_newmatrix(Aview, Bview, *AB, label + std::string(
" MSAK"), params);
972#ifdef HAVE_TPETRA_MMM_TIMINGS
974 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Scale"))));
980#ifdef HAVE_TPETRA_MMM_TIMINGS
982 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Add"))));
986 Teuchos::ParameterList jparams;
987 if (!params.is_null()) {
989 jparams.set(
"label", label + std::string(
" MSAK Add"));
991 Scalar one = Teuchos::ScalarTraits<Scalar>::one();
992 const Scalar negJacobiCoefficient = (-omega) * one;
993 Tpetra::MatrixMatrix::add(one,
false, *Bview.origMatrix, negJacobiCoefficient,
false, *AB, C, AB->getDomainMap(), AB->getRangeMap(), Teuchos::rcp(&jparams,
false));
994#ifdef HAVE_TPETRA_MMM_TIMINGS
999#if defined(HAVE_TPETRA_INST_OPENMP)
1001template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
1002static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1003 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1004 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1005 const LocalOrdinalViewType& Acol2PRow,
1006 const LocalOrdinalViewType& Acol2PRowImport,
1007 const LocalOrdinalViewType& Pcol2Accol,
1008 const LocalOrdinalViewType& PIcol2Accol,
1009 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1010 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1011 const std::string& label,
1012 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1014 using Tpetra::MatrixMatrix::UnmanagedView;
1015#ifdef HAVE_TPETRA_MMM_TIMINGS
1016 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1018 using Teuchos::TimeMonitor;
1019 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix LTGCore"))));
1022 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1024 typedef LocalOrdinal LO;
1025 typedef GlobalOrdinal GO;
1027 typedef Map<LO, GO, NO> map_type;
1029 typedef typename KCRS::StaticCrsGraphType graph_t;
1030 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1031 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1032 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1033 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1034 typedef typename KCRS::device_type device_t;
1035 typedef typename device_t::execution_space execution_space;
1036 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1039 typedef UnmanagedView<lno_view_t> u_lno_view_t;
1040 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1041 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1043 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1044 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1047 RCP<const map_type> Accolmap = Ac.getColMap();
1048 size_t m = Rview.origMatrix->getLocalNumRows();
1049 size_t n = Accolmap->getLocalNumElements();
1052 const KCRS Rmat = Rview.origMatrix->getLocalMatrixDevice();
1053 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
1054 const KCRS Pmat = Pview.origMatrix->getLocalMatrixDevice();
1056 c_lno_view_t Rrowptr = Rmat.graph.row_map,
1057 Arowptr = Amat.graph.row_map,
1058 Prowptr = Pmat.graph.row_map, Irowptr;
1059 const lno_nnz_view_t Rcolind = Rmat.graph.entries,
1060 Acolind = Amat.graph.entries,
1061 Pcolind = Pmat.graph.entries;
1062 lno_nnz_view_t Icolind;
1063 const scalar_view_t Rvals = Rmat.values,
1064 Avals = Amat.values,
1065 Pvals = Pmat.values;
1066 scalar_view_t Ivals;
1068 if (!Pview.importMatrix.is_null()) {
1069 const KCRS Imat = Pview.importMatrix->getLocalMatrixDevice();
1070 Irowptr = Imat.graph.row_map;
1071 Icolind = Imat.graph.entries;
1072 Ivals = Imat.values;
1083 size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1084 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1087 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1088 if (!params.is_null()) {
1089 if (params->isParameter(
"openmp: ltg thread max"))
1090 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
1093 double thread_chunk = (double)(m) / thread_max;
1096 lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
1098 lno_nnz_view_t entriesAc;
1099 scalar_view_t valuesAc;
1103 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz", thread_max + 1);
1113 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
1114 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
1117 Kokkos::parallel_for(
"MMM::RAP::NewMatrix::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
1119 size_t my_thread_start = tid * thread_chunk;
1120 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
1121 size_t my_thread_m = my_thread_stop - my_thread_start;
1123 size_t nnzAllocated = (size_t)(my_thread_m * Acest_nnz_per_row + 100);
1125 std::vector<size_t> ac_status(n, INVALID);
1128 u_lno_view_t Acrowptr((
typename u_lno_view_t::data_type)malloc(u_lno_view_t::shmem_size(my_thread_m + 1)), my_thread_m + 1);
1129 u_lno_nnz_view_t Accolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1130 u_scalar_view_t Acvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1133 size_t nnz = 0, nnz_old = 0;
1135 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1139 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
1141 const SC Rik = Rvals(kk);
1145 for (
size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
1147 const SC Akl = Avals(ll);
1151 if (Acol2PRow[l] != LO_INVALID) {
1158 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1161 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
1163 LO Acj = Pcol2Accol(j);
1166 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1167#ifdef HAVE_TPETRA_DEBUG
1169 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1171 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1174 ac_status[Acj] = nnz;
1175 Accolind(nnz) = Acj;
1176 Acvals(nnz) = Rik * Akl * Plj;
1179 Acvals(ac_status[Acj]) += Rik * Akl * Plj;
1189 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1190 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
1192 LO Acj = PIcol2Accol(j);
1195 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1196#ifdef HAVE_TPETRA_DEBUG
1198 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1200 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1203 ac_status[Acj] = nnz;
1204 Accolind(nnz) = Acj;
1205 Acvals(nnz) = Rik * Akl * Plj;
1208 Acvals(ac_status[Acj]) += Rik * Akl * Plj;
1215 if (nnz + n > nnzAllocated) {
1217 Accolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1218 Acvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*)Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1222 thread_total_nnz(tid) = nnz;
1223 tl_colind(tid) = Accolind;
1224 tl_values(tid) = Acvals;
1226#ifdef HAVE_TPETRA_MMM_TIMINGS
1228 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix copy from thread local"))));
1231 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1233#ifdef HAVE_TPETRA_MMM_TIMINGS
1235 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
1240 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1242 Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1244#ifdef HAVE_TPETRA_MMM_TIMINGS
1246 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
1257 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1258 labelList->set(
"Timer Label", label);
1259 if (!params.is_null()) labelList->set(
"compute global constants", params->get(
"compute global constants",
true));
1260 RCP<const Export<LO, GO, NO> > dummyExport;
1261 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1262 Rview.origMatrix->getRangeMap(),
1269template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
1270static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1271 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1272 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1273 const LocalOrdinalViewType& Acol2PRow,
1274 const LocalOrdinalViewType& Acol2PRowImport,
1275 const LocalOrdinalViewType& Pcol2Accol,
1276 const LocalOrdinalViewType& PIcol2Accol,
1277 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1278 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1279 const std::string& label,
1280 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1282 using Tpetra::MatrixMatrix::UnmanagedView;
1283#ifdef HAVE_TPETRA_MMM_TIMINGS
1284 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1286 using Teuchos::TimeMonitor;
1287 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse LTGCore"))));
1290 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1292 typedef LocalOrdinal LO;
1293 typedef GlobalOrdinal GO;
1295 typedef Map<LO, GO, NO> map_type;
1297 typedef typename KCRS::StaticCrsGraphType graph_t;
1298 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1299 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1300 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1301 typedef typename KCRS::device_type device_t;
1302 typedef typename device_t::execution_space execution_space;
1303 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1305 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1306 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1307 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1310 RCP<const map_type> Accolmap = Ac.getColMap();
1311 size_t m = Rview.origMatrix->getLocalNumRows();
1312 size_t n = Accolmap->getLocalNumElements();
1315 const KCRS Rmat = Rview.origMatrix->getLocalMatrixDevice();
1316 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
1317 const KCRS Pmat = Pview.origMatrix->getLocalMatrixDevice();
1318 const KCRS Cmat = Ac.getLocalMatrixDevice();
1320 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1321 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1322 lno_nnz_view_t Icolind;
1323 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1324 scalar_view_t Cvals = Cmat.values;
1325 scalar_view_t Ivals;
1327 if (!Pview.importMatrix.is_null()) {
1328 const KCRS Imat = Pview.importMatrix->getLocalMatrixDevice();
1329 Irowptr = Imat.graph.row_map;
1330 Icolind = Imat.graph.entries;
1331 Ivals = Imat.values;
1335 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1336 if (!params.is_null()) {
1337 if (params->isParameter(
"openmp: ltg thread max"))
1338 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
1341 double thread_chunk = (double)(m) / thread_max;
1344 Kokkos::parallel_for(
"MMM::RAP::Reuse::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
1346 size_t my_thread_start = tid * thread_chunk;
1347 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
1349 std::vector<size_t> c_status(n, INVALID);
1352 size_t OLD_ip = 0, CSR_ip = 0;
1354 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1357 OLD_ip = Crowptr(i);
1358 CSR_ip = Crowptr(i + 1);
1359 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
1360 c_status[Ccolind(k)] = k;
1366 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
1368 const SC Rik = Rvals(kk);
1372 for (
size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
1374 const SC Akl = Avals(ll);
1378 if (Acol2PRow[l] != LO_INVALID) {
1385 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1388 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
1390 LO Cij = Pcol2Accol(j);
1392 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1393 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
1394 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1396 Cvals(c_status[Cij]) += Rik * Akl * Plj;
1405 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1406 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
1408 LO Cij = PIcol2Accol(j);
1410 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1411 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
1412 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1414 Cvals(c_status[Cij]) += Rik * Akl * Plj;
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
Namespace Tpetra contains the class and methods constituting the Tpetra library.