TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
// @HEADER
// *****************************************************************************
// Tpetra: Templated Linear Algebra Services Package
//
// Copyright 2008 NTESS and the Tpetra contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
#define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
#include "Tpetra_RowMatrixTransposer.hpp"

namespace Tpetra {

namespace MatrixMatrix {

namespace ExtraKernels {

// Double product version
template <class CrsMatrixType>
size_t C_estimate_nnz_per_row(CrsMatrixType& A, CrsMatrixType& B) {
  // Follows the NZ estimate in ML's ml_matmatmult.c
  size_t Aest = 100, Best = 100;
  if (A.getLocalNumEntries() > 0)
    Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 100;
  if (B.getLocalNumEntries() > 0)
    Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries() / B.getLocalNumRows() : 100;

  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
  nnzperrow *= nnzperrow;

  return nnzperrow;
}
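// Worked example (illustrative only): if A averages 25 entries per local row
// and B averages 16, the estimate is (sqrt(25) + sqrt(16) - 1)^2 = 8^2 = 64
// nonzeros per row of C. The squared-sum form deliberately overshoots the
// per-row averages to reduce the chance of reallocation in the kernels below.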

// Triple product version
template <class CrsMatrixType>
size_t Ac_estimate_nnz(CrsMatrixType& A, CrsMatrixType& P) {
  size_t nnzPerRowA = 100, Pcols = 100;
  if (A.getLocalNumEntries() > 0)
    nnzPerRowA = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 9;
  if (P.getLocalNumEntries() > 0)
    Pcols = (P.getLocalNumCols() > 0) ? P.getLocalNumCols() : 100;
  return (size_t)(Pcols * nnzPerRowA + 5 * nnzPerRowA + 300);
}
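// Worked example (illustrative only): with nnzPerRowA = 9 and Pcols = 40 the
// estimate is 40*9 + 5*9 + 300 = 705 nonzeros for the whole local part of Ac.
// Unlike C_estimate_nnz_per_row, this is a total, not a per-row figure; the
// RAP kernel below divides it by the local row count.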

#if defined(HAVE_TPETRA_INST_OPENMP)
/*********************************************************************************************************/
template <class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class LocalOrdinalViewType>
void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
                                                 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
                                                 const LocalOrdinalViewType& targetMapToOrigRow,
                                                 const LocalOrdinalViewType& targetMapToImportRow,
                                                 const LocalOrdinalViewType& Bcol2Ccol,
                                                 const LocalOrdinalViewType& Icol2Ccol,
                                                 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
                                                 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
                                                 const std::string& label,
                                                 const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix LTGCore"))));
#endif
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>::local_matrix_device_type KCRS;
  typedef typename KCRS::device_type device_t;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  // Unmanaged versions of the above
  // typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
  typedef Map<LO, GO, NO> map_type;

  // NOTE (mfh 15 Sep 2017) This is specifically only for
  // execution_space = Kokkos::OpenMP, so we neither need nor want
  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
  typedef NO::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;

  // All of the invalid guys
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();

  c_lno_view_t Irowptr;
  lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if (!Bview.importMatrix.is_null()) {
    auto lclB = Bview.importMatrix->getLocalMatrixDevice();
    Irowptr = lclB.graph.row_map;
    Icolind = lclB.graph.entries;
    Ivals = lclB.values;
    b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
  }

  // Sizes
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getLocalNumRows();
  size_t n = Ccolmap->getLocalNumElements();
  size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);

  // Get my node / thread info (right from openmp or parameter list)
  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
  if (!params.is_null()) {
    if (params->isParameter("openmp: ltg thread max"))
      thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
  }
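  // A minimal caller-side sketch (not part of this file; assumes the public
  // TpetraExt interface forwards the same ParameterList to this kernel) of
  // capping the LTG thread count:
  //
  //   Teuchos::RCP<Teuchos::ParameterList> p = Teuchos::parameterList();
  //   p->set("openmp: ltg thread max", size_t(8));
  //   Tpetra::MatrixMatrix::Multiply(A, false, B, false, C, true, "MyLabel", p);
  //
  // The clamp above keeps the value within [1, hardware concurrency].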

  // 2019 Apr 10 jje: We can do rowptr in place, and no need to initialize since we can fault as we go
  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
  // we will not touch these until the final copyout step
  lno_nnz_view_t entriesC;
  scalar_view_t valuesC;

  // Track the nnz produced by each thread. Since we build the rowptr in
  // place, we could recover this from the rowptr, but that requires an
  // unusual loop (that jumps all over memory)
  lno_nnz_view_t thread_total_nnz("thread_total_nnz", thread_max + 1);

  // Thread-local memory
  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
  Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);

  double thread_chunk = (double)(m) / thread_max;

  // Run chunks of the matrix independently
  Kokkos::parallel_for("MMM::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
    // Thread coordination stuff
    size_t my_thread_start = tid * thread_chunk;
    size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
    size_t my_thread_m = my_thread_stop - my_thread_start;
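    // Chunking example (illustrative only): m = 10 rows on thread_max = 4
    // threads gives thread_chunk = 2.5, so after truncation the threads own
    // rows [0,2), [2,5), [5,7), and [7,10); the last thread always runs to m.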

    // Size estimate
    size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);

    // Allocations
    std::vector<size_t> c_status(n, INVALID);
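    // c_status is the classic Gustavson "sparse accumulator": c_status[col]
    // holds the position in (Ccolind, Cvals) where column `col` of the
    // current row was first inserted. A value of INVALID, or one pointing
    // before OLD_ip (the start of the current row), means the column has not
    // been seen in this row yet, so all rows can share one array without
    // clearing it between rows.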

    u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
    u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);

    // For each row of A/C
    size_t CSR_ip = 0, OLD_ip = 0;
    for (size_t i = my_thread_start; i < my_thread_stop; i++) {
      // mfh 27 Sep 2016: m is the number of rows in the input matrix A
      // on the calling process.
      // JJE 10 Apr 2019 index directly into the rowptr
      row_mapC(i) = CSR_ip;

      // mfh 27 Sep 2016: For each entry of A in the current row of A
      for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
        LO Aik = Acolind(k);       // local column index of current entry of A
        const SC Aval = Avals(k);  // value of current entry of A
        if (Aval == SC_ZERO)
          continue;  // skip explicitly stored zero values in A

        if (targetMapToOrigRow(Aik) != LO_INVALID) {
          // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
          // corresponding to the current entry of A is populated, then
          // the corresponding row of B is in B_local (i.e., it lives on
          // the calling process).

          // Local matrix
          size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));

          // mfh 27 Sep 2016: Go through all entries in that row of B_local.
          for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
            LO Bkj = Bcolind(j);
            LO Cij = Bcol2Ccol(Bkj);

            if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
              // New entry
              c_status[Cij] = CSR_ip;
              Ccolind(CSR_ip) = Cij;
              Cvals(CSR_ip) = Aval * Bvals(j);
              CSR_ip++;

            } else {
              Cvals(c_status[Cij]) += Aval * Bvals(j);
            }
          }

        } else {
          // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
          // corresponding to the current entry of A is NOT populated (has
          // a flag "invalid" value), then the corresponding row of B is
          // in B_remote (i.e., it does not live on the calling process).

          // Remote matrix
          size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
          for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
            LO Ikj = Icolind(j);
            LO Cij = Icol2Ccol(Ikj);

            if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
              // New entry
              c_status[Cij] = CSR_ip;
              Ccolind(CSR_ip) = Cij;
              Cvals(CSR_ip) = Aval * Ivals(j);
              CSR_ip++;

            } else {
              Cvals(c_status[Cij]) += Aval * Ivals(j);
            }
          }
        }
      }

      // Resize for next pass if needed
      if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1)) * b_max_nnz_per_row) > CSR_alloc) {
        CSR_alloc *= 2;
        Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
        Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
      }
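      // Growth heuristic example (illustrative only): if the next row of A
      // has 3 entries and b_max_nnz_per_row is 20, we demand 60 free slots
      // (capped at n, since one row of C can never exceed n entries) before
      // starting that row; otherwise the buffers double via realloc.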
      OLD_ip = CSR_ip;
    }
    thread_total_nnz(tid) = CSR_ip;
    tl_colind(tid) = Ccolind;
    tl_values(tid) = Cvals;
  });

  // Do the copy out
  copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPSort"))));
#endif
  // Sort & set values
  if (params.is_null() || params->get("sort entries", true))
    Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
  C.setAllValues(row_mapC, entriesC, valuesC);
}

/*********************************************************************************************************/
template <class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class LocalOrdinalViewType>
void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
                                             CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
                                             const LocalOrdinalViewType& targetMapToOrigRow,
                                             const LocalOrdinalViewType& targetMapToImportRow,
                                             const LocalOrdinalViewType& Bcol2Ccol,
                                             const LocalOrdinalViewType& Icol2Ccol,
                                             CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
                                             Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
                                             const std::string& label,
                                             const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse LTGCore"))));
#endif

  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>::local_matrix_device_type KCRS;
  // typedef typename KCRS::device_type device_t;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
  typedef Map<LO, GO, NO> map_type;

  // NOTE (mfh 15 Sep 2017) This is specifically only for
  // execution_space = Kokkos::OpenMP, so we neither need nor want
  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
  typedef NO::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;

  // All of the invalid guys
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
  const KCRS& Cmat = C.getLocalMatrixDevice();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
  scalar_view_t Cvals = Cmat.values;

  c_lno_view_t Irowptr;
  c_lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if (!Bview.importMatrix.is_null()) {
    auto lclB = Bview.importMatrix->getLocalMatrixDevice();
    Irowptr = lclB.graph.row_map;
    Icolind = lclB.graph.entries;
    Ivals = lclB.values;
  }

  // Sizes
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getLocalNumRows();
  size_t n = Ccolmap->getLocalNumElements();

  // Get my node / thread info (right from openmp or parameter list)
  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
  if (!params.is_null()) {
    if (params->isParameter("openmp: ltg thread max"))
      thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
  }

  double thread_chunk = (double)(m) / thread_max;

  // Run chunks of the matrix independently
  Kokkos::parallel_for("MMM::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
    // Thread coordination stuff
    size_t my_thread_start = tid * thread_chunk;
    size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;

    // Allocations
    std::vector<size_t> c_status(n, INVALID);

    // For each row of A/C
    size_t CSR_ip = 0, OLD_ip = 0;
    for (size_t i = my_thread_start; i < my_thread_stop; i++) {
      // First fill the c_status array w/ locations where we're allowed to
      // generate nonzeros for this row
      OLD_ip = Crowptr(i);
      CSR_ip = Crowptr(i + 1);
      for (size_t k = OLD_ip; k < CSR_ip; k++) {
        c_status[Ccolind(k)] = k;
        // Reset values in the row of C
        Cvals(k) = SC_ZERO;
      }
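      // In reuse mode the graph of C is fixed, so after this fill
      // c_status[col] lies in [OLD_ip, CSR_ip) exactly for the columns
      // already present in row i of C; the range checks below turn any
      // attempt to create a new structural entry into a runtime_error
      // instead of silent memory corruption.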

      for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
        LO Aik = Acolind(k);
        const SC Aval = Avals(k);
        if (Aval == SC_ZERO)
          continue;

        if (targetMapToOrigRow(Aik) != LO_INVALID) {
          // Local matrix
          size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));

          for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
            LO Bkj = Bcolind(j);
            LO Cij = Bcol2Ccol(Bkj);

            TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
                                       std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
                                       << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

            Cvals(c_status[Cij]) += Aval * Bvals(j);
          }
        } else {
          // Remote matrix
          size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
          for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
            LO Ikj = Icolind(j);
            LO Cij = Icol2Ccol(Ikj);

            TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
                                       std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
                                       << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

            Cvals(c_status[Cij]) += Aval * Ivals(j);
          }
        }
      }
    }
  });

  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
}

/*********************************************************************************************************/
template <class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class LocalOrdinalViewType>
void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
                                                   const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
                                                   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
                                                   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
                                                   const LocalOrdinalViewType& targetMapToOrigRow,
                                                   const LocalOrdinalViewType& targetMapToImportRow,
                                                   const LocalOrdinalViewType& Bcol2Ccol,
                                                   const LocalOrdinalViewType& Icol2Ccol,
                                                   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
                                                   Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
                                                   const std::string& label,
                                                   const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix LTGCore"))));
#endif

  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type KCRS;
  typedef typename KCRS::device_type device_t;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  // Unmanaged versions of the above
  // typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;

  // Jacobi-specific
  typedef typename scalar_view_t::memory_space scalar_memory_space;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO, GO, NO> map_type;

  // NOTE (mfh 15 Sep 2017) This is specifically only for
  // execution_space = Kokkos::OpenMP, so we neither need nor want
  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
  typedef NO::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;

  // All of the invalid guys
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();

  c_lno_view_t Irowptr;
  lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if (!Bview.importMatrix.is_null()) {
    auto lclB = Bview.importMatrix->getLocalMatrixDevice();
    Irowptr = lclB.graph.row_map;
    Icolind = lclB.graph.entries;
    Ivals = lclB.values;
    b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
  }

  // Jacobi-specific inner stuff
  auto Dvals =
      Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
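  // Row-wise, this kernel assembles C = (I - omega * Dinv * A) * B, i.e.
  //   C(i,:) = B(i,:) - omega * Dvals(i,0) * sum_k A(i,k) * B(k,:),
  // which is why each row below first copies B's entries and then
  // accumulates the A*B terms scaled by minusOmegaDval.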

  // Sizes
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getLocalNumRows();
  size_t n = Ccolmap->getLocalNumElements();
  size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);

  // Get my node / thread info (right from openmp)
  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
  if (!params.is_null()) {
    if (params->isParameter("openmp: ltg thread max"))
      thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
  }

  // 2019 Apr 10 jje: We can do rowptr in place, and no need to initialize since we can fault as we go
  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
  // we will not touch these until the final copyout step
  lno_nnz_view_t entriesC;
  scalar_view_t valuesC;

  // Track the nnz produced by each thread. Since we build the rowptr in
  // place, we could recover this from the rowptr, but that requires an
  // unusual loop (that jumps all over memory)
  lno_nnz_view_t thread_total_nnz("thread_total_nnz", thread_max + 1);

  // Thread-local memory
  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
  Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);

  double thread_chunk = (double)(m) / thread_max;

  // Run chunks of the matrix independently
  Kokkos::parallel_for("Jacobi::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
    // Thread coordination stuff
    size_t my_thread_start = tid * thread_chunk;
    size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
    size_t my_thread_m = my_thread_stop - my_thread_start;

    // Size estimate
    size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);

    // Allocations
    std::vector<size_t> c_status(n, INVALID);

    u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
    u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);

    // For each row of A/C
    size_t CSR_ip = 0, OLD_ip = 0;
    for (size_t i = my_thread_start; i < my_thread_stop; i++) {
      // printf("CMS: row %d CSR_alloc = %d\n",(int)i,(int)CSR_alloc);fflush(stdout);
      // mfh 27 Sep 2016: m is the number of rows in the input matrix A
      // on the calling process.
      // JJE: directly access the rowptr here (indexed using our loop var)
      row_mapC(i) = CSR_ip;
      // NOTE: Vector::getLocalView returns a rank 2 view here
      SC minusOmegaDval = -omega * Dvals(i, 0);

      // Entries of B
      for (size_t j = Browptr(i); j < Browptr(i + 1); j++) {
        const SC Bval = Bvals(j);
        if (Bval == SC_ZERO)
          continue;
        LO Bij = Bcolind(j);
        LO Cij = Bcol2Ccol(Bij);

        // Assume no repeated entries in B
        c_status[Cij] = CSR_ip;
        Ccolind(CSR_ip) = Cij;
        Cvals(CSR_ip) = Bvals(j);
        CSR_ip++;
      }

      // Entries of -omega * Dinv * A * B
      // mfh 27 Sep 2016: For each entry of A in the current row of A
      for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
        LO Aik = Acolind(k);       // local column index of current entry of A
        const SC Aval = Avals(k);  // value of current entry of A
        if (Aval == SC_ZERO)
          continue;  // skip explicitly stored zero values in A

        if (targetMapToOrigRow(Aik) != LO_INVALID) {
          // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
          // corresponding to the current entry of A is populated, then
          // the corresponding row of B is in B_local (i.e., it lives on
          // the calling process).

          // Local matrix
          size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));

          // mfh 27 Sep 2016: Go through all entries in that row of B_local.
          for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
            LO Bkj = Bcolind(j);
            LO Cij = Bcol2Ccol(Bkj);

            if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
              // New entry
              c_status[Cij] = CSR_ip;
              Ccolind(CSR_ip) = Cij;
              Cvals(CSR_ip) = minusOmegaDval * Aval * Bvals(j);
              CSR_ip++;

            } else {
              Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
            }
          }

        } else {
          // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
          // corresponding to the current entry of A is NOT populated (has
          // a flag "invalid" value), then the corresponding row of B is
          // in B_remote (i.e., it does not live on the calling process).

          // Remote matrix
          size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
          for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
            LO Ikj = Icolind(j);
            LO Cij = Icol2Ccol(Ikj);

            if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
              // New entry
              c_status[Cij] = CSR_ip;
              Ccolind(CSR_ip) = Cij;
              Cvals(CSR_ip) = minusOmegaDval * Aval * Ivals(j);
              CSR_ip++;

            } else {
              Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
            }
          }
        }
      }

      // Resize for next pass if needed
      if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1) + 1) * b_max_nnz_per_row) > CSR_alloc) {
        CSR_alloc *= 2;
        Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
        Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
      }
      OLD_ip = CSR_ip;
    }
    thread_total_nnz(tid) = CSR_ip;
    tl_colind(tid) = Ccolind;
    tl_values(tid) = Cvals;
  });

  // Do the copy out (removed the tl_rowptr!)
  copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix OpenMPSort"))));
#endif
  // Sort & set values
  if (params.is_null() || params->get("sort entries", true))
    Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
  C.setAllValues(row_mapC, entriesC, valuesC);
}

/*********************************************************************************************************/
template <class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class LocalOrdinalViewType>
void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
                                               const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
                                               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
                                               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
                                               const LocalOrdinalViewType& targetMapToOrigRow,
                                               const LocalOrdinalViewType& targetMapToImportRow,
                                               const LocalOrdinalViewType& Bcol2Ccol,
                                               const LocalOrdinalViewType& Icol2Ccol,
                                               CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
                                               Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
                                               const std::string& label,
                                               const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse LTGCore"))));
#endif
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type KCRS;
  // typedef typename KCRS::device_type device_t;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  // Jacobi-specific
  typedef typename scalar_view_t::memory_space scalar_memory_space;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO, GO, NO> map_type;

  // NOTE (mfh 15 Sep 2017) This is specifically only for
  // execution_space = Kokkos::OpenMP, so we neither need nor want
  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
  typedef NO::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;

  // All of the invalid guys
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
  const KCRS& Cmat = C.getLocalMatrixDevice();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
  scalar_view_t Cvals = Cmat.values;

  c_lno_view_t Irowptr;
  c_lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if (!Bview.importMatrix.is_null()) {
    auto lclB = Bview.importMatrix->getLocalMatrixDevice();
    Irowptr = lclB.graph.row_map;
    Icolind = lclB.graph.entries;
    Ivals = lclB.values;
  }

  // Jacobi-specific inner stuff
  auto Dvals =
      Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);

  // Sizes
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getLocalNumRows();
  size_t n = Ccolmap->getLocalNumElements();

  // Get my node / thread info (right from openmp or parameter list)
  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
  if (!params.is_null()) {
    if (params->isParameter("openmp: ltg thread max"))
      thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
  }

  double thread_chunk = (double)(m) / thread_max;

  // Run chunks of the matrix independently
  Kokkos::parallel_for("Jacobi::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
    // Thread coordination stuff
    size_t my_thread_start = tid * thread_chunk;
    size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;

    // Allocations
    std::vector<size_t> c_status(n, INVALID);

    // For each row of A/C
    size_t CSR_ip = 0, OLD_ip = 0;
    for (size_t i = my_thread_start; i < my_thread_stop; i++) {
      // First fill the c_status array w/ locations where we're allowed to
      // generate nonzeros for this row
      OLD_ip = Crowptr(i);
      CSR_ip = Crowptr(i + 1);
      // NOTE: Vector::getLocalView returns a rank 2 view here
      SC minusOmegaDval = -omega * Dvals(i, 0);

      for (size_t k = OLD_ip; k < CSR_ip; k++) {
        c_status[Ccolind(k)] = k;
        // Reset values in the row of C
        Cvals(k) = SC_ZERO;
      }

      // Entries of B
      for (size_t j = Browptr(i); j < Browptr(i + 1); j++) {
        const SC Bval = Bvals(j);
        if (Bval == SC_ZERO)
          continue;
        LO Bij = Bcolind(j);
        LO Cij = Bcol2Ccol(Bij);

        // Assume no repeated entries in B
        Cvals(c_status[Cij]) += Bvals(j);
        CSR_ip++;
      }

      for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
        LO Aik = Acolind(k);
        const SC Aval = Avals(k);
        if (Aval == SC_ZERO)
          continue;

        if (targetMapToOrigRow(Aik) != LO_INVALID) {
          // Local matrix
          size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));

          for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
            LO Bkj = Bcolind(j);
            LO Cij = Bcol2Ccol(Bkj);

            TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
                                       std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
                                       << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

            Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
          }
        } else {
          // Remote matrix
          size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
          for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
            LO Ikj = Icolind(j);
            LO Cij = Icol2Ccol(Ikj);

            TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
                                       std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
                                       << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

            Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
          }
        }
      }
    }
  });

  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
}

/*********************************************************************************************************/
template <class InColindArrayType, class InValsArrayType,
          class OutRowptrType, class OutColindType, class OutValsType>
void copy_out_from_thread_memory(const OutColindType& thread_total_nnz,
                                 const InColindArrayType& Incolind,
                                 const InValsArrayType& Invalues,
                                 const size_t m,
                                 const double thread_chunk,
                                 OutRowptrType& row_mapC,
                                 OutColindType& entriesC,
                                 OutValsType& valuesC) {
  typedef OutRowptrType lno_view_t;
  typedef OutColindType lno_nnz_view_t;
  typedef OutValsType scalar_view_t;
  typedef typename lno_view_t::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;

  // Generate the starting nnz number per thread
  size_t thread_max = Incolind.size();
  size_t c_nnz_size = 0;
  // Since this kernel targets low thread counts, a serial sum over
  // thread_total_nnz would very likely suffice; but because it is a view we
  // don't know where the memory actually lives, so we go parallel anyway.

  lno_view_t thread_start_nnz("thread_nnz", thread_max + 1);
  Kokkos::parallel_scan("LTG::Scan", range_type(0, thread_max).set_chunk_size(1), [=](const size_t i, size_t& update, const bool final) {
    size_t mynnz = thread_total_nnz(i);
    if (final) thread_start_nnz(i) = update;
    update += mynnz;
    if (final && i + 1 == thread_max) thread_start_nnz(i + 1) = update;
  });
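  // Exclusive-scan example (illustrative only): thread_total_nnz = {5, 3, 7}
  // yields thread_start_nnz = {0, 5, 8, 15}; thread t then copies its entries
  // into the global arrays starting at offset thread_start_nnz(t), and the
  // final element (15) is the total nnz of C.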
  c_nnz_size = thread_start_nnz(thread_max);

  // 2019 Apr 10 JJE: update the rowptr's final entry here
  row_mapC(m) = thread_start_nnz(thread_max);

  // Allocate output
  lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
  entriesC = entriesC_;
  scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
  valuesC = valuesC_;

  // Copy out
  Kokkos::parallel_for("LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
    const size_t my_thread_start = tid * thread_chunk;
    const size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
    const size_t nnz_thread_start = thread_start_nnz(tid);
    // we need this info, since we did the rowptr in place
    const size_t nnz_thread_stop = thread_start_nnz(tid + 1);

    // There are two fundamental operations:
    // * Updating the rowptr with the correct offset
    // * Copying entries and values

    // First, update the rowptr; since we built it in place, this is a += operation.
    // In the paper, I experimented with a coloring scheme that had threads
    // do their copies in different orders. It wasn't obvious it was beneficial,
    // but it can be replicated by choosing the array to copy first based on your
    // thread id % 3
    if (my_thread_start != 0) {
      for (size_t i = my_thread_start; i < my_thread_stop; i++) {
        row_mapC(i) += nnz_thread_start;
      }
    }

    // try to Kokkos::single() the alloc here. It should implicitly barrier
    // thread 0 doesn't copy the rowptr, so it could hit the single first
    // in the paper, I played a game that effectively got LTG down to a single
    // OpenMP barrier. But doing that requires the ability to spawn a single
    // parallel region. The Scan above was implemented using atomic adds
    // and the barrier was only needed so you could allocate
    //
    // Since we can't spawn a single region, we could move the allocations
    // here, and use 'single'. Most likely, thread 0 will hit it first, allowing
    // the other threads to update the rowptr while it allocates.

    // next, bulk copy the vals/colind arrays
    const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
    for (size_t i = 0; i < my_num_nnz; ++i) {
      entriesC(nnz_thread_start + i) = Incolind(tid)(i);
      valuesC(nnz_thread_start + i) = Invalues(tid)(i);
    }

    // Free the unmanaged views, let each thread deallocate its memory
    // May need to cast away const here..
    if (Incolind(tid).data()) free(Incolind(tid).data());
    if (Invalues(tid).data()) free(Invalues(tid).data());
  });
}  // end copy_out

#endif  // OpenMP

/*********************************************************************************************************/
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
                                                 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
                                                 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
                                                 const LocalOrdinalViewType& Acol2Brow,
                                                 const LocalOrdinalViewType& Acol2Irow,
                                                 const LocalOrdinalViewType& Bcol2Ccol,
                                                 const LocalOrdinalViewType& Icol2Ccol,
                                                 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
                                                 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Cimport,
                                                 const std::string& label,
                                                 const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK"))));
  Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Multiply"))));
  using Teuchos::rcp;
#endif
  typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;

  // This kernel computes (I - omega Dinv A) B the slow way (for generality's sake)
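  // In matrix form: C = (I - omega * Dinv * A) * B
  //                   = B - omega * Dinv * (A * B),
  // which is exactly the three steps below: (1) form AB = A*B, (2) left-scale
  // AB by Dinv, (3) compute C = 1*B + (-omega)*AB via MatrixMatrix::add.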

  // 1) Multiply A*B
  Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(new Matrix_t(C.getRowMap(), 0));
  Tpetra::MMdetails::mult_A_B_newmatrix(Aview, Bview, *AB, label + std::string(" MSAK"), params);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = Teuchos::null;
  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Scale"))));
#endif

  // 2) Scale AB by Dinv
  AB->leftScale(Dinv);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = Teuchos::null;
  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Add"))));
#endif

  // 3) Add B + (-omega) * (Dinv * A * B)
  Teuchos::ParameterList jparams;
  if (!params.is_null()) {
    jparams = *params;
    jparams.set("label", label + std::string(" MSAK Add"));
  }
  Scalar one = Teuchos::ScalarTraits<Scalar>::one();
  Tpetra::MatrixMatrix::add(one, false, *Bview.origMatrix, Scalar(-omega), false, *AB, C, AB->getDomainMap(), AB->getRangeMap(), Teuchos::rcp(&jparams, false));
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = Teuchos::null;
#endif
}  // jacobi_A_B_newmatrix_MultiplyScaleAddKernel

#if defined(HAVE_TPETRA_INST_OPENMP)
/*********************************************************************************************************/
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
                                                                 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
                                                                 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
                                                                 const LocalOrdinalViewType& Acol2PRow,
                                                                 const LocalOrdinalViewType& Acol2PRowImport,
                                                                 const LocalOrdinalViewType& Pcol2Accol,
                                                                 const LocalOrdinalViewType& PIcol2Accol,
                                                                 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
                                                                 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
                                                                 const std::string& label,
                                                                 const Teuchos::RCP<Teuchos::ParameterList>& params) {
  using Teuchos::RCP;
  using Tpetra::MatrixMatrix::UnmanagedView;
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::rcp;
  using Teuchos::TimeMonitor;
  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix LTGCore"))));
#endif

  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO, GO, NO> map_type;
  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;
  typedef typename KCRS::device_type device_t;
  typedef typename device_t::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;

  // Unmanaged versions of the above
  typedef UnmanagedView<lno_view_t> u_lno_view_t;
  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;

  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes
  RCP<const map_type> Accolmap = Ac.getColMap();
  size_t m = Rview.origMatrix->getLocalNumRows();
  size_t n = Accolmap->getLocalNumElements();

  // Get raw Kokkos matrices, and the raw CSR views
  const KCRS& Rmat = Rview.origMatrix->getLocalMatrixDevice();
  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
  const KCRS& Pmat = Pview.origMatrix->getLocalMatrixDevice();

  c_lno_view_t Rrowptr = Rmat.graph.row_map,
               Arowptr = Amat.graph.row_map,
               Prowptr = Pmat.graph.row_map, Irowptr;
  const lno_nnz_view_t Rcolind = Rmat.graph.entries,
                       Acolind = Amat.graph.entries,
                       Pcolind = Pmat.graph.entries;
  lno_nnz_view_t Icolind;
  const scalar_view_t Rvals = Rmat.values,
                      Avals = Amat.values,
                      Pvals = Pmat.values;
  scalar_view_t Ivals;

  if (!Pview.importMatrix.is_null()) {
    const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
    Irowptr = Imat.graph.row_map;
    Icolind = Imat.graph.entries;
    Ivals = Imat.values;
  }

  // Classic csr assembly (low memory edition)
  //
  // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
  // The method loops over rows of R, and may resize after processing
  // each row. Chris Siefert says that this reflects experience in
  // ML; for the non-threaded case, ML found it faster to spend less
  // effort on estimation and risk an occasional reallocation.

  size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();

  // Get my node / thread info (right from openmp or parameter list)
  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
  if (!params.is_null()) {
    if (params->isParameter("openmp: ltg thread max"))
      thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
  }

  double thread_chunk = (double)(m) / thread_max;

  // we can construct the rowptr inplace, allocate here and fault in parallel
  lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
  // we do not touch these until copy out
  lno_nnz_view_t entriesAc;
  scalar_view_t valuesAc;

  // Track the nnz produced by each thread. Since we do the rowptr in place,
  // we could figure this out using the rowptr, but it requires an unusual
  // loop (that jumps all over memory)
  lno_nnz_view_t thread_total_nnz("thread_total_nnz", thread_max + 1);

  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
  // routine. The routine computes Ac := R * A * (P_local + P_remote).
  //
  // For column index Aik in row i of A, Acol2PRow[Aik] tells
  // you whether the corresponding row of P belongs to P_local
  // ("orig") or P_remote ("Import").

  // Thread-local memory
  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
  Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);

  // For each row of R
  Kokkos::parallel_for("MMM::RAP::NewMatrix::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
    // Thread coordination stuff
    size_t my_thread_start = tid * thread_chunk;
    size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
    size_t my_thread_m = my_thread_stop - my_thread_start;

    size_t nnzAllocated = (size_t)(my_thread_m * Acest_nnz_per_row + 100);

    std::vector<size_t> ac_status(n, INVALID);

    // manually allocate the thread-local storage for Ac
    u_lno_view_t Acrowptr((typename u_lno_view_t::data_type)malloc(u_lno_view_t::shmem_size(my_thread_m + 1)), my_thread_m + 1);
    u_lno_nnz_view_t Accolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
    u_scalar_view_t Acvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);

    // count entries as they are added to Ac
    size_t nnz = 0, nnz_old = 0;
    // bmk: loop over the rows of R which are assigned to thread tid
    for (size_t i = my_thread_start; i < my_thread_stop; i++) {
      // directly index into the rowptr
      rowmapAc(i) = nnz;
      // mfh 27 Sep 2016: For each entry of R in the current row of R
      for (size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
        LO k = Rcolind(kk);        // local column index of current entry of R
        const SC Rik = Rvals(kk);  // value of current entry of R
        if (Rik == SC_ZERO)
          continue;  // skip explicitly stored zero values in R
        // For each entry of A in the current row of A
        for (size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
          LO l = Acolind(ll);        // local column index of current entry of A
          const SC Akl = Avals(ll);  // value of current entry of A
          if (Akl == SC_ZERO)
            continue;  // skip explicitly stored zero values in A

          if (Acol2PRow[l] != LO_INVALID) {
            // mfh 27 Sep 2016: If the entry of Acol2PRow
            // corresponding to the current entry of A is populated, then
            // the corresponding row of P is in P_local (i.e., it lives on
            // the calling process).

            // Local matrix
            size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));

            // mfh 27 Sep 2016: Go through all entries in that row of P_local.
            for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
              LO j = Pcolind(jj);
              LO Acj = Pcol2Accol(j);
              SC Plj = Pvals(jj);

              if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
                // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
                TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
                                           std::runtime_error,
                                           label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
#endif
                // New entry
                ac_status[Acj] = nnz;
                Accolind(nnz) = Acj;
                Acvals(nnz) = Rik * Akl * Plj;
                nnz++;
              } else {
                Acvals(ac_status[Acj]) += Rik * Akl * Plj;
              }
            }
          } else {
            // mfh 27 Sep 2016: If the entry of Acol2PRow
            // corresponding to the current entry of A is NOT populated (has
            // a flag "invalid" value), then the corresponding row of P is
            // in P_remote (i.e., it does not live on the calling process).

            // Remote matrix
            size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
            for (size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
              LO j = Icolind(jj);
              LO Acj = PIcol2Accol(j);
              SC Plj = Ivals(jj);

              if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
                // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
                TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
                                           std::runtime_error,
                                           label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
#endif
                // New entry
                ac_status[Acj] = nnz;
                Accolind(nnz) = Acj;
                Acvals(nnz) = Rik * Akl * Plj;
                nnz++;
              } else {
                Acvals(ac_status[Acj]) += Rik * Akl * Plj;
              }
            }
          }
        }
      }
      // Resize for next pass if needed
      if (nnz + n > nnzAllocated) {
        nnzAllocated *= 2;
        Accolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
        Acvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*)Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
      }
      nnz_old = nnz;
    }
    thread_total_nnz(tid) = nnz;
    tl_colind(tid) = Accolind;
    tl_values(tid) = Acvals;
  });
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix copy from thread local"))));
#endif

  copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
#endif

  // Final sort & set of CRS arrays
  Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
  // mfh 27 Sep 2016: This just sets pointers.
  Ac.setAllValues(rowmapAc, entriesAc, valuesAc);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
#endif

  // Final FillComplete
  //
  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
  // Import (from domain Map to column Map) construction (which costs
  // lots of communication) by taking the previously constructed
  // Import object. We should be able to do this without interfering
  // with the implementation of the local part of sparse matrix-matrix
  // multiply above.
  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
  labelList->set("Timer Label", label);
  if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
  RCP<const Export<LO, GO, NO> > dummyExport;
  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
                              Rview.origMatrix->getRangeMap(),
                              Acimport,
                              dummyExport,
                              labelList);
}

/*********************************************************************************************************/
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
                                                             CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
                                                             CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
                                                             const LocalOrdinalViewType& Acol2PRow,
                                                             const LocalOrdinalViewType& Acol2PRowImport,
                                                             const LocalOrdinalViewType& Pcol2Accol,
                                                             const LocalOrdinalViewType& PIcol2Accol,
                                                             CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
                                                             Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
                                                             const std::string& label,
                                                             const Teuchos::RCP<Teuchos::ParameterList>& params) {
  using Teuchos::RCP;
  using Tpetra::MatrixMatrix::UnmanagedView;
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::rcp;
  using Teuchos::TimeMonitor;
  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse LTGCore"))));
#endif

  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO, GO, NO> map_type;
  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;
  typedef typename KCRS::device_type device_t;
  typedef typename device_t::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;

  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();

  // Sizes
  RCP<const map_type> Accolmap = Ac.getColMap();
  size_t m = Rview.origMatrix->getLocalNumRows();
  size_t n = Accolmap->getLocalNumElements();

  // Get raw Kokkos matrices, and the raw CSR views
  const KCRS& Rmat = Rview.origMatrix->getLocalMatrixDevice();
  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
  const KCRS& Pmat = Pview.origMatrix->getLocalMatrixDevice();
  const KCRS& Cmat = Ac.getLocalMatrixDevice();

  c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
  const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
  lno_nnz_view_t Icolind;
  const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
  scalar_view_t Cvals = Cmat.values;
  scalar_view_t Ivals;

  if (!Pview.importMatrix.is_null()) {
    const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
    Irowptr = Imat.graph.row_map;
    Icolind = Imat.graph.entries;
    Ivals = Imat.values;
  }

  // Get my node / thread info (right from openmp or parameter list)
  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
  if (!params.is_null()) {
    if (params->isParameter("openmp: ltg thread max"))
      thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
  }

  double thread_chunk = (double)(m) / thread_max;

  // For each row of R
  Kokkos::parallel_for("MMM::RAP::Reuse::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
    // Thread coordination stuff
    size_t my_thread_start = tid * thread_chunk;
    size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;

    std::vector<size_t> c_status(n, INVALID);

    // count entries as they are added to Ac
    size_t OLD_ip = 0, CSR_ip = 0;
    // bmk: loop over the rows of R which are assigned to thread tid
    for (size_t i = my_thread_start; i < my_thread_stop; i++) {
      // First fill the c_status array w/ locations where we're allowed to
      // generate nonzeros for this row
      OLD_ip = Crowptr(i);
      CSR_ip = Crowptr(i + 1);
      for (size_t k = OLD_ip; k < CSR_ip; k++) {
        c_status[Ccolind(k)] = k;
        // Reset values in the row of C
        Cvals(k) = SC_ZERO;
      }

      // mfh 27 Sep 2016: For each entry of R in the current row of R
      for (size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
        LO k = Rcolind(kk);        // local column index of current entry of R
        const SC Rik = Rvals(kk);  // value of current entry of R
        if (Rik == SC_ZERO)
          continue;  // skip explicitly stored zero values in R
        // For each entry of A in the current row of A
        for (size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
          LO l = Acolind(ll);        // local column index of current entry of A
          const SC Akl = Avals(ll);  // value of current entry of A
          if (Akl == SC_ZERO)
            continue;  // skip explicitly stored zero values in A

          if (Acol2PRow[l] != LO_INVALID) {
            // mfh 27 Sep 2016: If the entry of Acol2PRow
            // corresponding to the current entry of A is populated, then
            // the corresponding row of P is in P_local (i.e., it lives on
            // the calling process).

            // Local matrix
            size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));

            // mfh 27 Sep 2016: Go through all entries in that row of P_local.
            for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
              LO j = Pcolind(jj);
              LO Cij = Pcol2Accol(j);
              SC Plj = Pvals(jj);
              TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
                                         std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
                                         << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

              Cvals(c_status[Cij]) += Rik * Akl * Plj;
            }
          } else {
            // mfh 27 Sep 2016: If the entry of Acol2PRow
            // corresponding to the current entry of A is NOT populated (has
            // a flag "invalid" value), then the corresponding row of P is
            // in P_remote (i.e., it does not live on the calling process).

            // Remote matrix
            size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
            for (size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
              LO j = Icolind(jj);
              LO Cij = PIcol2Accol(j);
              SC Plj = Ivals(jj);
              TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
                                         std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
                                         << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

              Cvals(c_status[Cij]) += Rik * Akl * Plj;
            }
          }
        }
      }
    }
  });
  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
}

#endif

}  // namespace ExtraKernels
}  // namespace MatrixMatrix
}  // namespace Tpetra

#endif