10#ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
11#define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
12#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
13#include "Tpetra_RowMatrixTransposer.hpp"
17namespace MatrixMatrix {
19namespace ExtraKernels {
22template <
class CrsMatrixType>
23size_t C_estimate_nnz_per_row(CrsMatrixType& A, CrsMatrixType& B) {
25 size_t Aest = 100, Best = 100;
26 if (A.getLocalNumEntries() > 0)
27 Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 100;
28 if (B.getLocalNumEntries() > 0)
29 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries() / B.getLocalNumRows() : 100;
31 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
32 nnzperrow *= nnzperrow;
38template <
class CrsMatrixType>
39size_t Ac_estimate_nnz(CrsMatrixType& A, CrsMatrixType& P) {
40 size_t nnzPerRowA = 100, Pcols = 100;
41 if (A.getLocalNumEntries() > 0)
42 nnzPerRowA = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 9;
43 if (P.getLocalNumEntries() > 0)
44 Pcols = (P.getLocalNumCols() > 0) ? P.getLocalNumCols() : 100;
45 return (
size_t)(Pcols * nnzPerRowA + 5 * nnzPerRowA + 300);
48#if defined(HAVE_TPETRA_INST_OPENMP)
50template <
class Scalar,
53 class LocalOrdinalViewType>
54void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
55 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
56 const LocalOrdinalViewType& targetMapToOrigRow,
57 const LocalOrdinalViewType& targetMapToImportRow,
58 const LocalOrdinalViewType& Bcol2Ccol,
59 const LocalOrdinalViewType& Icol2Ccol,
60 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
61 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
62 const std::string& label,
63 const Teuchos::RCP<Teuchos::ParameterList>& params) {
64#ifdef HAVE_TPETRA_MMM_TIMINGS
65 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
66 using Teuchos::TimeMonitor;
67 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix LTGCore"))));
70 using Teuchos::ArrayRCP;
71 using Teuchos::ArrayView;
77 typedef typename KCRS::device_type device_t;
78 typedef typename KCRS::StaticCrsGraphType graph_t;
79 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
80 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
81 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
82 typedef typename KCRS::values_type::non_const_type scalar_view_t;
86 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
87 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
90 typedef LocalOrdinal LO;
91 typedef GlobalOrdinal GO;
92 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
93 typedef Map<LO, GO, NO> map_type;
98 typedef NO::execution_space execution_space;
99 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
102 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
103 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
104 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
106 bool skipExplicitZero =
true;
107 if (params && params->isParameter(
"MM Skip Explicit Zeros")) {
108 skipExplicitZero = params->get<
bool>(
"MM Skip Explicit Zeros");
112 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
113 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
115 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
116 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
117 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
118 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
120 c_lno_view_t Irowptr;
121 lno_nnz_view_t Icolind;
123 if (!Bview.importMatrix.is_null()) {
124 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
125 Irowptr = lclB.graph.row_map;
126 Icolind = lclB.graph.entries;
128 b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
132 RCP<const map_type> Ccolmap = C.getColMap();
133 size_t m = Aview.origMatrix->getLocalNumRows();
134 size_t n = Ccolmap->getLocalNumElements();
135 size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
138 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
139 if (!params.is_null()) {
140 if (params->isParameter(
"openmp: ltg thread max"))
141 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
145 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
147 lno_nnz_view_t entriesC;
148 scalar_view_t valuesC;
152 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz", thread_max + 1);
155 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
156 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
158 double thread_chunk = (double)(m) / thread_max;
161 Kokkos::parallel_for(
"MMM::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
163 size_t my_thread_start = tid * thread_chunk;
164 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
165 size_t my_thread_m = my_thread_stop - my_thread_start;
168 size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
171 std::vector<size_t> c_status(n, INVALID);
173 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
174 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
177 size_t CSR_ip = 0, OLD_ip = 0;
178 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
182 row_mapC(i) = CSR_ip;
185 for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
187 const SC Aval = Avals(k);
188 if (Aval == SC_ZERO && skipExplicitZero)
191 if (targetMapToOrigRow(Aik) != LO_INVALID) {
198 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
201 for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
203 LO Cij = Bcol2Ccol(Bkj);
205 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
207 c_status[Cij] = CSR_ip;
208 Ccolind(CSR_ip) = Cij;
209 Cvals(CSR_ip) = Aval * Bvals(j);
213 Cvals(c_status[Cij]) += Aval * Bvals(j);
224 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
225 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
227 LO Cij = Icol2Ccol(Ikj);
229 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
231 c_status[Cij] = CSR_ip;
232 Ccolind(CSR_ip) = Cij;
233 Cvals(CSR_ip) = Aval * Ivals(j);
237 Cvals(c_status[Cij]) += Aval * Ivals(j);
244 if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1)) * b_max_nnz_per_row) > CSR_alloc) {
246 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
247 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
251 thread_total_nnz(tid) = CSR_ip;
252 tl_colind(tid) = Ccolind;
253 tl_values(tid) = Cvals;
257 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
259#ifdef HAVE_TPETRA_MMM_TIMINGS
261 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix OpenMPSort"))));
264 if (params.is_null() || params->get(
"sort entries",
true)) {
266 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC, ::KokkosSparse::SortAlgorithm::SHELL);
268 C.setAllValues(row_mapC, entriesC, valuesC);
272template <
class Scalar,
275 class LocalOrdinalViewType>
276void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
277 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
278 const LocalOrdinalViewType& targetMapToOrigRow,
279 const LocalOrdinalViewType& targetMapToImportRow,
280 const LocalOrdinalViewType& Bcol2Ccol,
281 const LocalOrdinalViewType& Icol2Ccol,
282 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
283 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
284 const std::string& label,
285 const Teuchos::RCP<Teuchos::ParameterList>& params) {
286#ifdef HAVE_TPETRA_MMM_TIMINGS
287 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
288 using Teuchos::TimeMonitor;
289 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse LTGCore"))));
292 using Teuchos::Array;
293 using Teuchos::ArrayRCP;
294 using Teuchos::ArrayView;
301 typedef typename KCRS::StaticCrsGraphType graph_t;
302 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
303 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
304 typedef typename KCRS::values_type::non_const_type scalar_view_t;
307 typedef LocalOrdinal LO;
308 typedef GlobalOrdinal GO;
309 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
310 typedef Map<LO, GO, NO> map_type;
315 typedef NO::execution_space execution_space;
316 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
319 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
320 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
321 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
324 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
325 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
326 const KCRS Cmat = C.getLocalMatrixDevice();
328 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
329 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
330 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
331 scalar_view_t Cvals = Cmat.values;
333 c_lno_view_t Irowptr;
334 c_lno_nnz_view_t Icolind;
336 if (!Bview.importMatrix.is_null()) {
337 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
338 Irowptr = lclB.graph.row_map;
339 Icolind = lclB.graph.entries;
344 RCP<const map_type> Ccolmap = C.getColMap();
345 size_t m = Aview.origMatrix->getLocalNumRows();
346 size_t n = Ccolmap->getLocalNumElements();
349 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
350 if (!params.is_null()) {
351 if (params->isParameter(
"openmp: ltg thread max"))
352 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
355 double thread_chunk = (double)(m) / thread_max;
358 Kokkos::parallel_for(
"MMM::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
360 size_t my_thread_start = tid * thread_chunk;
361 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
364 std::vector<size_t> c_status(n, INVALID);
367 size_t CSR_ip = 0, OLD_ip = 0;
368 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
372 CSR_ip = Crowptr(i + 1);
373 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
374 c_status[Ccolind(k)] = k;
379 for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
381 const SC Aval = Avals(k);
385 if (targetMapToOrigRow(Aik) != LO_INVALID) {
387 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
389 for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
391 LO Cij = Bcol2Ccol(Bkj);
393 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
394 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
395 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
397 Cvals(c_status[Cij]) += Aval * Bvals(j);
401 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
402 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
404 LO Cij = Icol2Ccol(Ikj);
406 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
407 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
408 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
410 Cvals(c_status[Cij]) += Aval * Ivals(j);
421template <
class Scalar,
424 class LocalOrdinalViewType>
425void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
426 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
427 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
428 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
429 const LocalOrdinalViewType& targetMapToOrigRow,
430 const LocalOrdinalViewType& targetMapToImportRow,
431 const LocalOrdinalViewType& Bcol2Ccol,
432 const LocalOrdinalViewType& Icol2Ccol,
433 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
434 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
435 const std::string& label,
436 const Teuchos::RCP<Teuchos::ParameterList>& params) {
437#ifdef HAVE_TPETRA_MMM_TIMINGS
438 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
439 using Teuchos::TimeMonitor;
440 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix LTGCore"))));
443 using Teuchos::Array;
444 using Teuchos::ArrayRCP;
445 using Teuchos::ArrayView;
450 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
452 typedef typename KCRS::device_type device_t;
453 typedef typename KCRS::StaticCrsGraphType graph_t;
454 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
455 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
456 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
457 typedef typename KCRS::values_type::non_const_type scalar_view_t;
461 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
462 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
465 typedef typename scalar_view_t::memory_space scalar_memory_space;
468 typedef LocalOrdinal LO;
469 typedef GlobalOrdinal GO;
471 typedef Map<LO, GO, NO> map_type;
476 typedef NO::execution_space execution_space;
477 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
480 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
481 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
482 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
485 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
486 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
488 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
489 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
490 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
491 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
493 c_lno_view_t Irowptr;
494 lno_nnz_view_t Icolind;
496 if (!Bview.importMatrix.is_null()) {
497 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
498 Irowptr = lclB.graph.row_map;
499 Icolind = lclB.graph.entries;
501 b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
506 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
509 RCP<const map_type> Ccolmap = C.getColMap();
510 size_t m = Aview.origMatrix->getLocalNumRows();
511 size_t n = Ccolmap->getLocalNumElements();
512 size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
515 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
516 if (!params.is_null()) {
517 if (params->isParameter(
"openmp: ltg thread max"))
518 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
522 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
524 lno_nnz_view_t entriesC;
525 scalar_view_t valuesC;
529 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz", thread_max + 1);
532 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
533 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
535 double thread_chunk = (double)(m) / thread_max;
538 Kokkos::parallel_for(
"Jacobi::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
540 size_t my_thread_start = tid * thread_chunk;
541 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
542 size_t my_thread_m = my_thread_stop - my_thread_start;
545 size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
548 std::vector<size_t> c_status(n, INVALID);
550 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
551 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
554 size_t CSR_ip = 0, OLD_ip = 0;
555 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
560 row_mapC(i) = CSR_ip;
562 SC minusOmegaDval = -omega * Dvals(i, 0);
565 for (
size_t j = Browptr(i); j < Browptr(i + 1); j++) {
566 const SC Bval = Bvals(j);
570 LO Cij = Bcol2Ccol(Bij);
573 c_status[Cij] = CSR_ip;
574 Ccolind(CSR_ip) = Cij;
575 Cvals(CSR_ip) = Bvals[j];
581 for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
583 const SC Aval = Avals(k);
587 if (targetMapToOrigRow(Aik) != LO_INVALID) {
594 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
597 for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
599 LO Cij = Bcol2Ccol(Bkj);
601 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
603 c_status[Cij] = CSR_ip;
604 Ccolind(CSR_ip) = Cij;
605 Cvals(CSR_ip) = minusOmegaDval * Aval * Bvals(j);
609 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
620 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
621 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
623 LO Cij = Icol2Ccol(Ikj);
625 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
627 c_status[Cij] = CSR_ip;
628 Ccolind(CSR_ip) = Cij;
629 Cvals(CSR_ip) = minusOmegaDval * Aval * Ivals(j);
633 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
640 if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1) + 1) * b_max_nnz_per_row) > CSR_alloc) {
642 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
643 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
647 thread_total_nnz(tid) = CSR_ip;
648 tl_colind(tid) = Ccolind;
649 tl_values(tid) = Cvals;
653 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
655#ifdef HAVE_TPETRA_MMM_TIMINGS
657 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix OpenMPSort"))));
660 if (params.is_null() || params->get(
"sort entries",
true)) {
662 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC, ::KokkosSparse::SortAlgorithm::SHELL);
664 C.setAllValues(row_mapC, entriesC, valuesC);
668template <
class Scalar,
671 class LocalOrdinalViewType>
672void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
673 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
674 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
675 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
676 const LocalOrdinalViewType& targetMapToOrigRow,
677 const LocalOrdinalViewType& targetMapToImportRow,
678 const LocalOrdinalViewType& Bcol2Ccol,
679 const LocalOrdinalViewType& Icol2Ccol,
680 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
681 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
682 const std::string& label,
683 const Teuchos::RCP<Teuchos::ParameterList>& params) {
684#ifdef HAVE_TPETRA_MMM_TIMINGS
685 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
686 using Teuchos::TimeMonitor;
687 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse LTGCore"))));
689 using Teuchos::Array;
690 using Teuchos::ArrayRCP;
691 using Teuchos::ArrayView;
696 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
699 typedef typename KCRS::StaticCrsGraphType graph_t;
700 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
701 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
702 typedef typename KCRS::values_type::non_const_type scalar_view_t;
705 typedef typename scalar_view_t::memory_space scalar_memory_space;
708 typedef LocalOrdinal LO;
709 typedef GlobalOrdinal GO;
711 typedef Map<LO, GO, NO> map_type;
716 typedef NO::execution_space execution_space;
717 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
720 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
721 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
722 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
725 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
726 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
727 const KCRS Cmat = C.getLocalMatrixDevice();
729 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
730 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
731 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
732 scalar_view_t Cvals = Cmat.values;
734 c_lno_view_t Irowptr;
735 c_lno_nnz_view_t Icolind;
737 if (!Bview.importMatrix.is_null()) {
738 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
739 Irowptr = lclB.graph.row_map;
740 Icolind = lclB.graph.entries;
746 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
749 RCP<const map_type> Ccolmap = C.getColMap();
750 size_t m = Aview.origMatrix->getLocalNumRows();
751 size_t n = Ccolmap->getLocalNumElements();
754 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
755 if (!params.is_null()) {
756 if (params->isParameter(
"openmp: ltg thread max"))
757 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
760 double thread_chunk = (double)(m) / thread_max;
763 Kokkos::parallel_for(
"Jacobi::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
765 size_t my_thread_start = tid * thread_chunk;
766 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
769 std::vector<size_t> c_status(n, INVALID);
772 size_t CSR_ip = 0, OLD_ip = 0;
773 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
777 CSR_ip = Crowptr(i + 1);
779 SC minusOmegaDval = -omega * Dvals(i, 0);
781 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
782 c_status[Ccolind(k)] = k;
788 for (
size_t j = Browptr(i); j < Browptr(i + 1); j++) {
789 const SC Bval = Bvals(j);
793 LO Cij = Bcol2Ccol(Bij);
796 Cvals(c_status[Cij]) += Bvals(j);
800 for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
802 const SC Aval = Avals(k);
806 if (targetMapToOrigRow(Aik) != LO_INVALID) {
808 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
810 for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
812 LO Cij = Bcol2Ccol(Bkj);
814 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
815 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
816 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
818 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
822 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
823 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
825 LO Cij = Icol2Ccol(Ikj);
827 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
828 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
829 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
831 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
842template <
class InColindArrayType,
class InValsArrayType,
843 class OutRowptrType,
class OutColindType,
class OutValsType>
844void copy_out_from_thread_memory(
const OutColindType& thread_total_nnz,
845 const InColindArrayType& Incolind,
846 const InValsArrayType& Invalues,
848 const double thread_chunk,
849 OutRowptrType& row_mapC,
850 OutColindType& entriesC,
851 OutValsType& valuesC) {
852 typedef OutRowptrType lno_view_t;
853 typedef OutColindType lno_nnz_view_t;
854 typedef OutValsType scalar_view_t;
855 typedef typename lno_view_t::execution_space execution_space;
856 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
859 size_t thread_max = Incolind.size();
860 size_t c_nnz_size = 0;
865 lno_view_t thread_start_nnz(
"thread_nnz", thread_max + 1);
866 Kokkos::parallel_scan(
"LTG::Scan", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t i,
size_t& update,
const bool final) {
867 size_t mynnz = thread_total_nnz(i);
868 if (
final) thread_start_nnz(i) = update;
870 if (
final && i + 1 == thread_max) thread_start_nnz(i + 1) = update;
872 c_nnz_size = thread_start_nnz(thread_max);
875 row_mapC(m) = thread_start_nnz(thread_max);
878 lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
879 entriesC = entriesC_;
880 scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
884 Kokkos::parallel_for(
"LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
885 const size_t my_thread_start = tid * thread_chunk;
886 const size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
887 const size_t nnz_thread_start = thread_start_nnz(tid);
889 const size_t nnz_thread_stop = thread_start_nnz(tid + 1);
900 if (my_thread_start != 0) {
901 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
902 row_mapC(i) += nnz_thread_start;
918 const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
919 for (
size_t i = 0; i < my_num_nnz; ++i) {
920 entriesC(nnz_thread_start + i) = Incolind(tid)(i);
921 valuesC(nnz_thread_start + i) = Invalues(tid)(i);
926 if (Incolind(tid).data()) free(Incolind(tid).data());
927 if (Invalues(tid).data()) free(Invalues(tid).data());
934template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
935void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
936 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
937 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
938 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
939 const LocalOrdinalViewType& Acol2Brow,
940 const LocalOrdinalViewType& Acol2Irow,
941 const LocalOrdinalViewType& Bcol2Ccol,
942 const LocalOrdinalViewType& Icol2Ccol,
943 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
944 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Cimport,
945 const std::string& label,
946 const Teuchos::RCP<Teuchos::ParameterList>& params) {
947#ifdef HAVE_TPETRA_MMM_TIMINGS
948 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
949 using Teuchos::TimeMonitor;
950 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK"))));
951 Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Multiply"))));
954 typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
959 Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(
new Matrix_t(C.getRowMap(), 0));
960 Tpetra::MMdetails::mult_A_B_newmatrix(Aview, Bview, *AB, label + std::string(
" MSAK"), params);
962#ifdef HAVE_TPETRA_MMM_TIMINGS
964 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Scale"))));
970#ifdef HAVE_TPETRA_MMM_TIMINGS
972 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Add"))));
976 Teuchos::ParameterList jparams;
977 if (!params.is_null()) {
979 jparams.set(
"label", label + std::string(
" MSAK Add"));
981 Scalar one = Teuchos::ScalarTraits<Scalar>::one();
982 Tpetra::MatrixMatrix::add(one,
false, *Bview.origMatrix, Scalar(-omega),
false, *AB, C, AB->getDomainMap(), AB->getRangeMap(), Teuchos::rcp(&jparams,
false));
983#ifdef HAVE_TPETRA_MMM_TIMINGS
988#if defined(HAVE_TPETRA_INST_OPENMP)
990template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
991static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
992 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
993 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
994 const LocalOrdinalViewType& Acol2PRow,
995 const LocalOrdinalViewType& Acol2PRowImport,
996 const LocalOrdinalViewType& Pcol2Accol,
997 const LocalOrdinalViewType& PIcol2Accol,
998 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
999 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1000 const std::string& label,
1001 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1003 using Tpetra::MatrixMatrix::UnmanagedView;
1004#ifdef HAVE_TPETRA_MMM_TIMINGS
1005 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1007 using Teuchos::TimeMonitor;
1008 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix LTGCore"))));
1011 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1013 typedef LocalOrdinal LO;
1014 typedef GlobalOrdinal GO;
1016 typedef Map<LO, GO, NO> map_type;
1018 typedef typename KCRS::StaticCrsGraphType graph_t;
1019 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1020 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1021 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1022 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1023 typedef typename KCRS::device_type device_t;
1024 typedef typename device_t::execution_space execution_space;
1025 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1028 typedef UnmanagedView<lno_view_t> u_lno_view_t;
1029 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1030 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1032 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1033 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1036 RCP<const map_type> Accolmap = Ac.getColMap();
1037 size_t m = Rview.origMatrix->getLocalNumRows();
1038 size_t n = Accolmap->getLocalNumElements();
1041 const KCRS Rmat = Rview.origMatrix->getLocalMatrixDevice();
1042 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
1043 const KCRS Pmat = Pview.origMatrix->getLocalMatrixDevice();
1045 c_lno_view_t Rrowptr = Rmat.graph.row_map,
1046 Arowptr = Amat.graph.row_map,
1047 Prowptr = Pmat.graph.row_map, Irowptr;
1048 const lno_nnz_view_t Rcolind = Rmat.graph.entries,
1049 Acolind = Amat.graph.entries,
1050 Pcolind = Pmat.graph.entries;
1051 lno_nnz_view_t Icolind;
1052 const scalar_view_t Rvals = Rmat.values,
1053 Avals = Amat.values,
1054 Pvals = Pmat.values;
1055 scalar_view_t Ivals;
1057 if (!Pview.importMatrix.is_null()) {
1058 const KCRS Imat = Pview.importMatrix->getLocalMatrixDevice();
1059 Irowptr = Imat.graph.row_map;
1060 Icolind = Imat.graph.entries;
1061 Ivals = Imat.values;
1072 size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1073 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1076 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1077 if (!params.is_null()) {
1078 if (params->isParameter(
"openmp: ltg thread max"))
1079 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
1082 double thread_chunk = (double)(m) / thread_max;
1085 lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
1087 lno_nnz_view_t entriesAc;
1088 scalar_view_t valuesAc;
1092 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz", thread_max + 1);
1102 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
1103 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
1106 Kokkos::parallel_for(
"MMM::RAP::NewMatrix::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
1108 size_t my_thread_start = tid * thread_chunk;
1109 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
1110 size_t my_thread_m = my_thread_stop - my_thread_start;
1112 size_t nnzAllocated = (size_t)(my_thread_m * Acest_nnz_per_row + 100);
1114 std::vector<size_t> ac_status(n, INVALID);
1117 u_lno_view_t Acrowptr((
typename u_lno_view_t::data_type)malloc(u_lno_view_t::shmem_size(my_thread_m + 1)), my_thread_m + 1);
1118 u_lno_nnz_view_t Accolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1119 u_scalar_view_t Acvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1122 size_t nnz = 0, nnz_old = 0;
1124 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1128 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
1130 const SC Rik = Rvals(kk);
1134 for (
size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
1136 const SC Akl = Avals(ll);
1140 if (Acol2PRow[l] != LO_INVALID) {
1147 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1150 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
1152 LO Acj = Pcol2Accol(j);
1155 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1156#ifdef HAVE_TPETRA_DEBUG
1158 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1160 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1163 ac_status[Acj] = nnz;
1164 Accolind(nnz) = Acj;
1165 Acvals(nnz) = Rik * Akl * Plj;
1168 Acvals(ac_status[Acj]) += Rik * Akl * Plj;
1178 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1179 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
1181 LO Acj = PIcol2Accol(j);
1184 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1185#ifdef HAVE_TPETRA_DEBUG
1187 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1189 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1192 ac_status[Acj] = nnz;
1193 Accolind(nnz) = Acj;
1194 Acvals(nnz) = Rik * Akl * Plj;
1197 Acvals(ac_status[Acj]) += Rik * Akl * Plj;
1204 if (nnz + n > nnzAllocated) {
1206 Accolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1207 Acvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*)Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1211 thread_total_nnz(tid) = nnz;
1212 tl_colind(tid) = Accolind;
1213 tl_values(tid) = Acvals;
1215#ifdef HAVE_TPETRA_MMM_TIMINGS
1217 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix copy from thread local"))));
1220 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1222#ifdef HAVE_TPETRA_MMM_TIMINGS
1224 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
1229 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc, ::KokkosSparse::SortAlgorithm::SHELL);
1231 Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1233#ifdef HAVE_TPETRA_MMM_TIMINGS
1235 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
1246 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1247 labelList->set(
"Timer Label", label);
1248 if (!params.is_null()) labelList->set(
"compute global constants", params->get(
"compute global constants",
true));
1249 RCP<const Export<LO, GO, NO> > dummyExport;
1250 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1251 Rview.origMatrix->getRangeMap(),
1258template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
1259static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1260 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1261 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1262 const LocalOrdinalViewType& Acol2PRow,
1263 const LocalOrdinalViewType& Acol2PRowImport,
1264 const LocalOrdinalViewType& Pcol2Accol,
1265 const LocalOrdinalViewType& PIcol2Accol,
1266 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1267 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1268 const std::string& label,
1269 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1271 using Tpetra::MatrixMatrix::UnmanagedView;
1272#ifdef HAVE_TPETRA_MMM_TIMINGS
1273 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1275 using Teuchos::TimeMonitor;
1276 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse LTGCore"))));
1279 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1281 typedef LocalOrdinal LO;
1282 typedef GlobalOrdinal GO;
1284 typedef Map<LO, GO, NO> map_type;
1286 typedef typename KCRS::StaticCrsGraphType graph_t;
1287 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1288 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1289 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1290 typedef typename KCRS::device_type device_t;
1291 typedef typename device_t::execution_space execution_space;
1292 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1294 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1295 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1296 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1299 RCP<const map_type> Accolmap = Ac.getColMap();
1300 size_t m = Rview.origMatrix->getLocalNumRows();
1301 size_t n = Accolmap->getLocalNumElements();
1304 const KCRS Rmat = Rview.origMatrix->getLocalMatrixDevice();
1305 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
1306 const KCRS Pmat = Pview.origMatrix->getLocalMatrixDevice();
1307 const KCRS Cmat = Ac.getLocalMatrixDevice();
1309 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1310 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1311 lno_nnz_view_t Icolind;
1312 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1313 scalar_view_t Cvals = Cmat.values;
1314 scalar_view_t Ivals;
1316 if (!Pview.importMatrix.is_null()) {
1317 const KCRS Imat = Pview.importMatrix->getLocalMatrixDevice();
1318 Irowptr = Imat.graph.row_map;
1319 Icolind = Imat.graph.entries;
1320 Ivals = Imat.values;
1324 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1325 if (!params.is_null()) {
1326 if (params->isParameter(
"openmp: ltg thread max"))
1327 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
1330 double thread_chunk = (double)(m) / thread_max;
1333 Kokkos::parallel_for(
"MMM::RAP::Reuse::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
1335 size_t my_thread_start = tid * thread_chunk;
1336 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
1338 std::vector<size_t> c_status(n, INVALID);
1341 size_t OLD_ip = 0, CSR_ip = 0;
1343 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1346 OLD_ip = Crowptr(i);
1347 CSR_ip = Crowptr(i + 1);
1348 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
1349 c_status[Ccolind(k)] = k;
1355 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
1357 const SC Rik = Rvals(kk);
1361 for (
size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
1363 const SC Akl = Avals(ll);
1367 if (Acol2PRow[l] != LO_INVALID) {
1374 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1377 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
1379 LO Cij = Pcol2Accol(j);
1381 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1382 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
1383 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1385 Cvals(c_status[Cij]) += Rik * Akl * Plj;
1394 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1395 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
1397 LO Cij = PIcol2Accol(j);
1399 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1400 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
1401 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1403 Cvals(c_status[Cij]) += Rik * Akl * Plj;
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
Namespace Tpetra contains the class and methods constituting the Tpetra library.