Tpetra parallel linear algebra: TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
11#define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
12#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
13#include "Tpetra_RowMatrixTransposer.hpp"
14
15namespace Tpetra {
16
17namespace MatrixMatrix {
18
19namespace ExtraKernels {
20
// Double product version
//
// Estimate the number of nonzeros per row of C = A*B, following the
// nonzero estimate used in ML's ml_matmatmult.c: square the value
// (sqrt(mean nnz/row of A) + sqrt(mean nnz/row of B) - 1), truncated
// to an integer.  Falls back to 100 nnz/row for empty/degenerate input.
template <class CrsMatrixType>
size_t C_estimate_nnz_per_row(CrsMatrixType& A, CrsMatrixType& B) {
  const size_t fallback = 100;
  size_t meanA = fallback;
  size_t meanB = fallback;

  // Mean nnz per row of each factor; keep the fallback when a matrix
  // has no entries or no rows on this process.
  if (A.getLocalNumEntries() > 0 && A.getLocalNumRows() > 0)
    meanA = A.getLocalNumEntries() / A.getLocalNumRows();
  if (B.getLocalNumEntries() > 0 && B.getLocalNumRows() > 0)
    meanB = B.getLocalNumEntries() / B.getLocalNumRows();

  const double root = std::sqrt(static_cast<double>(meanA)) + std::sqrt(static_cast<double>(meanB)) - 1.0;
  const size_t nnzperrow = static_cast<size_t>(root);
  return nnzperrow * nnzperrow;
}
36
// Triple product version
//
// Estimate the total local nonzero count of Ac = P^T*A*P.  Uses the mean
// nnz per row of A (fallback 9 when A has entries but no local rows,
// 100 when A is empty) and P's local column count (fallback 100), then
// pads the product with slack terms.
template <class CrsMatrixType>
size_t Ac_estimate_nnz(CrsMatrixType& A, CrsMatrixType& P) {
  size_t nnzPerRowA = 100;
  size_t Pcols = 100;

  if (A.getLocalNumEntries() > 0) {
    const size_t Arows = A.getLocalNumRows();
    nnzPerRowA = (Arows > 0) ? (A.getLocalNumEntries() / Arows) : 9;
  }
  if (P.getLocalNumEntries() > 0) {
    const size_t cols = P.getLocalNumCols();
    Pcols = (cols > 0) ? cols : 100;
  }
  return static_cast<size_t>(Pcols * nnzPerRowA + 5 * nnzPerRowA + 300);
}
47
48#if defined(HAVE_TPETRA_INST_OPENMP)
49/*********************************************************************************************************/
50template <class Scalar,
51 class LocalOrdinal,
52 class GlobalOrdinal,
53 class LocalOrdinalViewType>
54void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
55 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
56 const LocalOrdinalViewType& targetMapToOrigRow,
57 const LocalOrdinalViewType& targetMapToImportRow,
58 const LocalOrdinalViewType& Bcol2Ccol,
59 const LocalOrdinalViewType& Icol2Ccol,
60 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
61 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
62 const std::string& label,
63 const Teuchos::RCP<Teuchos::ParameterList>& params) {
64#ifdef HAVE_TPETRA_MMM_TIMINGS
65 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
66 using Teuchos::TimeMonitor;
67 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix LTGCore"))));
68#endif
69 using Teuchos::Array;
70 using Teuchos::ArrayRCP;
71 using Teuchos::ArrayView;
72 using Teuchos::RCP;
73 using Teuchos::rcp;
74
75 // Lots and lots of typedefs
77 typedef typename KCRS::device_type device_t;
78 typedef typename KCRS::StaticCrsGraphType graph_t;
79 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
80 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
81 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
82 typedef typename KCRS::values_type::non_const_type scalar_view_t;
83
84 // Unmanaged versions of the above
85 // typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
86 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
87 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
88
89 typedef Scalar SC;
90 typedef LocalOrdinal LO;
91 typedef GlobalOrdinal GO;
92 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
93 typedef Map<LO, GO, NO> map_type;
94
95 // NOTE (mfh 15 Sep 2017) This is specifically only for
96 // execution_space = Kokkos::OpenMP, so we neither need nor want
97 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
98 typedef NO::execution_space execution_space;
99 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
100
101 // All of the invalid guys
102 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
103 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
104 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
105
106 bool skipExplicitZero = true;
107 if (params && params->isParameter("MM Skip Explicit Zeros")) {
108 skipExplicitZero = params->get<bool>("MM Skip Explicit Zeros");
109 }
110
111 // Grab the Kokkos::SparseCrsMatrices & inner stuff
112 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
113 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
114
115 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
116 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
117 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
118 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
119
120 c_lno_view_t Irowptr;
121 lno_nnz_view_t Icolind;
122 scalar_view_t Ivals;
123 if (!Bview.importMatrix.is_null()) {
124 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
125 Irowptr = lclB.graph.row_map;
126 Icolind = lclB.graph.entries;
127 Ivals = lclB.values;
128 b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
129 }
130
131 // Sizes
132 RCP<const map_type> Ccolmap = C.getColMap();
133 size_t m = Aview.origMatrix->getLocalNumRows();
134 size_t n = Ccolmap->getLocalNumElements();
135 size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
136
137 // Get my node / thread info (right from openmp or parameter list)
138 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
139 if (!params.is_null()) {
140 if (params->isParameter("openmp: ltg thread max"))
141 thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
142 }
143
144 // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
145 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
146 // we will not touch these until the final copyout step
147 lno_nnz_view_t entriesC;
148 scalar_view_t valuesC;
149
150 // add this, since we do the rowptr in place, we could figure this out
151 // using the rowptr, but it requires an unusual loop (that jumps all over memory)
152 lno_nnz_view_t thread_total_nnz("thread_total_nnz", thread_max + 1);
153
154 // Thread-local memory
155 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
156 Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);
157
158 double thread_chunk = (double)(m) / thread_max;
159
160 // Run chunks of the matrix independently
161 Kokkos::parallel_for("MMM::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
162 // Thread coordination stuff
163 size_t my_thread_start = tid * thread_chunk;
164 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
165 size_t my_thread_m = my_thread_stop - my_thread_start;
166
167 // Size estimate
168 size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
169
170 // Allocations
171 std::vector<size_t> c_status(n, INVALID);
172
173 u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
174 u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
175
176 // For each row of A/C
177 size_t CSR_ip = 0, OLD_ip = 0;
178 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
179 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
180 // on the calling process.
181 // JJE 10 Apr 2019 index directly into the rowptr
182 row_mapC(i) = CSR_ip;
183
184 // mfh 27 Sep 2016: For each entry of A in the current row of A
185 for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
186 LO Aik = Acolind(k); // local column index of current entry of A
187 const SC Aval = Avals(k); // value of current entry of A
188 if (Aval == SC_ZERO && skipExplicitZero)
189 continue; // skip explicitly stored zero values in A
190
191 if (targetMapToOrigRow(Aik) != LO_INVALID) {
192 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
193 // corresponding to the current entry of A is populated, then
194 // the corresponding row of B is in B_local (i.e., it lives on
195 // the calling process).
196
197 // Local matrix
198 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
199
200 // mfh 27 Sep 2016: Go through all entries in that row of B_local.
201 for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
202 LO Bkj = Bcolind(j);
203 LO Cij = Bcol2Ccol(Bkj);
204
205 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
206 // New entry
207 c_status[Cij] = CSR_ip;
208 Ccolind(CSR_ip) = Cij;
209 Cvals(CSR_ip) = Aval * Bvals(j);
210 CSR_ip++;
211
212 } else {
213 Cvals(c_status[Cij]) += Aval * Bvals(j);
214 }
215 }
216
217 } else {
218 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
219 // corresponding to the current entry of A NOT populated (has
220 // a flag "invalid" value), then the corresponding row of B is
221 // in B_local (i.e., it lives on the calling process).
222
223 // Remote matrix
224 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
225 for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
226 LO Ikj = Icolind(j);
227 LO Cij = Icol2Ccol(Ikj);
228
229 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
230 // New entry
231 c_status[Cij] = CSR_ip;
232 Ccolind(CSR_ip) = Cij;
233 Cvals(CSR_ip) = Aval * Ivals(j);
234 CSR_ip++;
235
236 } else {
237 Cvals(c_status[Cij]) += Aval * Ivals(j);
238 }
239 }
240 }
241 }
242
243 // Resize for next pass if needed
244 if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1)) * b_max_nnz_per_row) > CSR_alloc) {
245 CSR_alloc *= 2;
246 Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
247 Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
248 }
249 OLD_ip = CSR_ip;
250 }
251 thread_total_nnz(tid) = CSR_ip;
252 tl_colind(tid) = Ccolind;
253 tl_values(tid) = Cvals;
254 });
255
256 // Do the copy out
257 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
258
259#ifdef HAVE_TPETRA_MMM_TIMINGS
260 MM = Teuchos::null;
261 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPSort"))));
262#endif
263 // Sort & set values
264 if (params.is_null() || params->get("sort entries", true)) {
265 // Tpetra's SpGEMM results in almost sorted matrices. Use shell sort.
266 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC, ::KokkosSparse::SortAlgorithm::SHELL);
267 }
268 C.setAllValues(row_mapC, entriesC, valuesC);
269}
270
271/*********************************************************************************************************/
272template <class Scalar,
273 class LocalOrdinal,
274 class GlobalOrdinal,
275 class LocalOrdinalViewType>
276void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
277 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
278 const LocalOrdinalViewType& targetMapToOrigRow,
279 const LocalOrdinalViewType& targetMapToImportRow,
280 const LocalOrdinalViewType& Bcol2Ccol,
281 const LocalOrdinalViewType& Icol2Ccol,
282 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
283 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
284 const std::string& label,
285 const Teuchos::RCP<Teuchos::ParameterList>& params) {
286#ifdef HAVE_TPETRA_MMM_TIMINGS
287 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
288 using Teuchos::TimeMonitor;
289 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse LTGCore"))));
290#endif
291
292 using Teuchos::Array;
293 using Teuchos::ArrayRCP;
294 using Teuchos::ArrayView;
295 using Teuchos::RCP;
296 using Teuchos::rcp;
297
298 // Lots and lots of typedefs
300 // typedef typename KCRS::device_type device_t;
301 typedef typename KCRS::StaticCrsGraphType graph_t;
302 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
303 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
304 typedef typename KCRS::values_type::non_const_type scalar_view_t;
305
306 typedef Scalar SC;
307 typedef LocalOrdinal LO;
308 typedef GlobalOrdinal GO;
309 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
310 typedef Map<LO, GO, NO> map_type;
311
312 // NOTE (mfh 15 Sep 2017) This is specifically only for
313 // execution_space = Kokkos::OpenMP, so we neither need nor want
314 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
315 typedef NO::execution_space execution_space;
316 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
317
318 // All of the invalid guys
319 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
320 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
321 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
322
323 // Grab the Kokkos::SparseCrsMatrices & inner stuff
324 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
325 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
326 const KCRS Cmat = C.getLocalMatrixDevice();
327
328 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
329 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
330 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
331 scalar_view_t Cvals = Cmat.values;
332
333 c_lno_view_t Irowptr;
334 c_lno_nnz_view_t Icolind;
335 scalar_view_t Ivals;
336 if (!Bview.importMatrix.is_null()) {
337 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
338 Irowptr = lclB.graph.row_map;
339 Icolind = lclB.graph.entries;
340 Ivals = lclB.values;
341 }
342
343 // Sizes
344 RCP<const map_type> Ccolmap = C.getColMap();
345 size_t m = Aview.origMatrix->getLocalNumRows();
346 size_t n = Ccolmap->getLocalNumElements();
347
348 // Get my node / thread info (right from openmp or parameter list)
349 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
350 if (!params.is_null()) {
351 if (params->isParameter("openmp: ltg thread max"))
352 thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
353 }
354
355 double thread_chunk = (double)(m) / thread_max;
356
357 // Run chunks of the matrix independently
358 Kokkos::parallel_for("MMM::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
359 // Thread coordination stuff
360 size_t my_thread_start = tid * thread_chunk;
361 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
362
363 // Allocations
364 std::vector<size_t> c_status(n, INVALID);
365
366 // For each row of A/C
367 size_t CSR_ip = 0, OLD_ip = 0;
368 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
369 // First fill the c_status array w/ locations where we're allowed to
370 // generate nonzeros for this row
371 OLD_ip = Crowptr(i);
372 CSR_ip = Crowptr(i + 1);
373 for (size_t k = OLD_ip; k < CSR_ip; k++) {
374 c_status[Ccolind(k)] = k;
375 // Reset values in the row of C
376 Cvals(k) = SC_ZERO;
377 }
378
379 for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
380 LO Aik = Acolind(k);
381 const SC Aval = Avals(k);
382 if (Aval == SC_ZERO)
383 continue;
384
385 if (targetMapToOrigRow(Aik) != LO_INVALID) {
386 // Local matrix
387 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
388
389 for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
390 LO Bkj = Bcolind(j);
391 LO Cij = Bcol2Ccol(Bkj);
392
393 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
394 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
395 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
396
397 Cvals(c_status[Cij]) += Aval * Bvals(j);
398 }
399 } else {
400 // Remote matrix
401 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
402 for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
403 LO Ikj = Icolind(j);
404 LO Cij = Icol2Ccol(Ikj);
405
406 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
407 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
408 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
409
410 Cvals(c_status[Cij]) += Aval * Ivals(j);
411 }
412 }
413 }
414 }
415 });
416
417 // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
418}
419
420/*********************************************************************************************************/
421template <class Scalar,
422 class LocalOrdinal,
423 class GlobalOrdinal,
424 class LocalOrdinalViewType>
425void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
426 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
427 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
428 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
429 const LocalOrdinalViewType& targetMapToOrigRow,
430 const LocalOrdinalViewType& targetMapToImportRow,
431 const LocalOrdinalViewType& Bcol2Ccol,
432 const LocalOrdinalViewType& Icol2Ccol,
433 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
434 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
435 const std::string& label,
436 const Teuchos::RCP<Teuchos::ParameterList>& params) {
437#ifdef HAVE_TPETRA_MMM_TIMINGS
438 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
439 using Teuchos::TimeMonitor;
440 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix LTGCore"))));
441#endif
442
443 using Teuchos::Array;
444 using Teuchos::ArrayRCP;
445 using Teuchos::ArrayView;
446 using Teuchos::RCP;
447 using Teuchos::rcp;
448
449 // Lots and lots of typedefs
450 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
452 typedef typename KCRS::device_type device_t;
453 typedef typename KCRS::StaticCrsGraphType graph_t;
454 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
455 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
456 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
457 typedef typename KCRS::values_type::non_const_type scalar_view_t;
458
459 // Unmanaged versions of the above
460 // typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
461 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
462 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
463
464 // Jacobi-specific
465 typedef typename scalar_view_t::memory_space scalar_memory_space;
466
467 typedef Scalar SC;
468 typedef LocalOrdinal LO;
469 typedef GlobalOrdinal GO;
470 typedef Node NO;
471 typedef Map<LO, GO, NO> map_type;
472
473 // NOTE (mfh 15 Sep 2017) This is specifically only for
474 // execution_space = Kokkos::OpenMP, so we neither need nor want
475 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
476 typedef NO::execution_space execution_space;
477 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
478
479 // All of the invalid guys
480 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
481 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
482 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
483
484 // Grab the Kokkos::SparseCrsMatrices & inner stuff
485 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
486 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
487
488 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
489 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
490 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
491 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
492
493 c_lno_view_t Irowptr;
494 lno_nnz_view_t Icolind;
495 scalar_view_t Ivals;
496 if (!Bview.importMatrix.is_null()) {
497 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
498 Irowptr = lclB.graph.row_map;
499 Icolind = lclB.graph.entries;
500 Ivals = lclB.values;
501 b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
502 }
503
504 // Jacobi-specific inner stuff
505 auto Dvals =
506 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
507
508 // Sizes
509 RCP<const map_type> Ccolmap = C.getColMap();
510 size_t m = Aview.origMatrix->getLocalNumRows();
511 size_t n = Ccolmap->getLocalNumElements();
512 size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
513
514 // Get my node / thread info (right from openmp)
515 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
516 if (!params.is_null()) {
517 if (params->isParameter("openmp: ltg thread max"))
518 thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
519 }
520
521 // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
522 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
523 // we will not touch these until the final copyout step
524 lno_nnz_view_t entriesC;
525 scalar_view_t valuesC;
526
527 // add this, since we do the rowptr in place, we could figure this out
528 // using the rowptr, but it requires an unusual loop (that jumps all over memory)
529 lno_nnz_view_t thread_total_nnz("thread_total_nnz", thread_max + 1);
530
531 // Thread-local memory
532 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
533 Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);
534
535 double thread_chunk = (double)(m) / thread_max;
536
537 // Run chunks of the matrix independently
538 Kokkos::parallel_for("Jacobi::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
539 // Thread coordination stuff
540 size_t my_thread_start = tid * thread_chunk;
541 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
542 size_t my_thread_m = my_thread_stop - my_thread_start;
543
544 // Size estimate
545 size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
546
547 // Allocations
548 std::vector<size_t> c_status(n, INVALID);
549
550 u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
551 u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
552
553 // For each row of A/C
554 size_t CSR_ip = 0, OLD_ip = 0;
555 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
556 // printf("CMS: row %d CSR_alloc = %d\n",(int)i,(int)CSR_alloc);fflush(stdout);
557 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
558 // on the calling process.
559 // JJE: directly access the rowptr here (indexed using our loop var)
560 row_mapC(i) = CSR_ip;
561 // NOTE: Vector::getLocalView returns a rank 2 view here
562 SC minusOmegaDval = -omega * Dvals(i, 0);
563
564 // Entries of B
565 for (size_t j = Browptr(i); j < Browptr(i + 1); j++) {
566 const SC Bval = Bvals(j);
567 if (Bval == SC_ZERO)
568 continue;
569 LO Bij = Bcolind(j);
570 LO Cij = Bcol2Ccol(Bij);
571
572 // Assume no repeated entries in B
573 c_status[Cij] = CSR_ip;
574 Ccolind(CSR_ip) = Cij;
575 Cvals(CSR_ip) = Bvals[j];
576 CSR_ip++;
577 }
578
579 // Entries of -omega * Dinv * A * B
580 // mfh 27 Sep 2016: For each entry of A in the current row of A
581 for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
582 LO Aik = Acolind(k); // local column index of current entry of A
583 const SC Aval = Avals(k); // value of current entry of A
584 if (Aval == SC_ZERO)
585 continue; // skip explicitly stored zero values in A
586
587 if (targetMapToOrigRow(Aik) != LO_INVALID) {
588 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
589 // corresponding to the current entry of A is populated, then
590 // the corresponding row of B is in B_local (i.e., it lives on
591 // the calling process).
592
593 // Local matrix
594 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
595
596 // mfh 27 Sep 2016: Go through all entries in that row of B_local.
597 for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
598 LO Bkj = Bcolind(j);
599 LO Cij = Bcol2Ccol(Bkj);
600
601 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
602 // New entry
603 c_status[Cij] = CSR_ip;
604 Ccolind(CSR_ip) = Cij;
605 Cvals(CSR_ip) = minusOmegaDval * Aval * Bvals(j);
606 CSR_ip++;
607
608 } else {
609 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
610 }
611 }
612
613 } else {
614 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
615 // corresponding to the current entry of A NOT populated (has
616 // a flag "invalid" value), then the corresponding row of B is
617 // in B_local (i.e., it lives on the calling process).
618
619 // Remote matrix
620 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
621 for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
622 LO Ikj = Icolind(j);
623 LO Cij = Icol2Ccol(Ikj);
624
625 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
626 // New entry
627 c_status[Cij] = CSR_ip;
628 Ccolind(CSR_ip) = Cij;
629 Cvals(CSR_ip) = minusOmegaDval * Aval * Ivals(j);
630 CSR_ip++;
631
632 } else {
633 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
634 }
635 }
636 }
637 }
638
639 // Resize for next pass if needed
640 if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1) + 1) * b_max_nnz_per_row) > CSR_alloc) {
641 CSR_alloc *= 2;
642 Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
643 Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
644 }
645 OLD_ip = CSR_ip;
646 }
647 thread_total_nnz(tid) = CSR_ip;
648 tl_colind(tid) = Ccolind;
649 tl_values(tid) = Cvals;
650 });
651
652 // Do the copy out (removed the tl_rowptr!)
653 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
654
655#ifdef HAVE_TPETRA_MMM_TIMINGS
656 MM = Teuchos::null;
657 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix OpenMPSort"))));
658#endif
659 // Sort & set values
660 if (params.is_null() || params->get("sort entries", true)) {
661 // Tpetra's SpGEMM results in almost sorted matrices. Use shell sort.
662 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC, ::KokkosSparse::SortAlgorithm::SHELL);
663 }
664 C.setAllValues(row_mapC, entriesC, valuesC);
665}
666
667/*********************************************************************************************************/
668template <class Scalar,
669 class LocalOrdinal,
670 class GlobalOrdinal,
671 class LocalOrdinalViewType>
672void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
673 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
674 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
675 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
676 const LocalOrdinalViewType& targetMapToOrigRow,
677 const LocalOrdinalViewType& targetMapToImportRow,
678 const LocalOrdinalViewType& Bcol2Ccol,
679 const LocalOrdinalViewType& Icol2Ccol,
680 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
681 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
682 const std::string& label,
683 const Teuchos::RCP<Teuchos::ParameterList>& params) {
684#ifdef HAVE_TPETRA_MMM_TIMINGS
685 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
686 using Teuchos::TimeMonitor;
687 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse LTGCore"))));
688#endif
689 using Teuchos::Array;
690 using Teuchos::ArrayRCP;
691 using Teuchos::ArrayView;
692 using Teuchos::RCP;
693 using Teuchos::rcp;
694
695 // Lots and lots of typedefs
696 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
698 // typedef typename KCRS::device_type device_t;
699 typedef typename KCRS::StaticCrsGraphType graph_t;
700 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
701 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
702 typedef typename KCRS::values_type::non_const_type scalar_view_t;
703
704 // Jacobi-specific
705 typedef typename scalar_view_t::memory_space scalar_memory_space;
706
707 typedef Scalar SC;
708 typedef LocalOrdinal LO;
709 typedef GlobalOrdinal GO;
710 typedef Node NO;
711 typedef Map<LO, GO, NO> map_type;
712
713 // NOTE (mfh 15 Sep 2017) This is specifically only for
714 // execution_space = Kokkos::OpenMP, so we neither need nor want
715 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
716 typedef NO::execution_space execution_space;
717 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
718
719 // All of the invalid guys
720 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
721 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
722 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
723
724 // Grab the Kokkos::SparseCrsMatrices & inner stuff
725 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
726 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
727 const KCRS Cmat = C.getLocalMatrixDevice();
728
729 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
730 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
731 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
732 scalar_view_t Cvals = Cmat.values;
733
734 c_lno_view_t Irowptr;
735 c_lno_nnz_view_t Icolind;
736 scalar_view_t Ivals;
737 if (!Bview.importMatrix.is_null()) {
738 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
739 Irowptr = lclB.graph.row_map;
740 Icolind = lclB.graph.entries;
741 Ivals = lclB.values;
742 }
743
744 // Jacobi-specific inner stuff
745 auto Dvals =
746 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
747
748 // Sizes
749 RCP<const map_type> Ccolmap = C.getColMap();
750 size_t m = Aview.origMatrix->getLocalNumRows();
751 size_t n = Ccolmap->getLocalNumElements();
752
753 // Get my node / thread info (right from openmp or parameter list)
754 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
755 if (!params.is_null()) {
756 if (params->isParameter("openmp: ltg thread max"))
757 thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
758 }
759
760 double thread_chunk = (double)(m) / thread_max;
761
762 // Run chunks of the matrix independently
763 Kokkos::parallel_for("Jacobi::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
764 // Thread coordination stuff
765 size_t my_thread_start = tid * thread_chunk;
766 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
767
768 // Allocations
769 std::vector<size_t> c_status(n, INVALID);
770
771 // For each row of A/C
772 size_t CSR_ip = 0, OLD_ip = 0;
773 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
774 // First fill the c_status array w/ locations where we're allowed to
775 // generate nonzeros for this row
776 OLD_ip = Crowptr(i);
777 CSR_ip = Crowptr(i + 1);
778 // NOTE: Vector::getLocalView returns a rank 2 view here
779 SC minusOmegaDval = -omega * Dvals(i, 0);
780
781 for (size_t k = OLD_ip; k < CSR_ip; k++) {
782 c_status[Ccolind(k)] = k;
783 // Reset values in the row of C
784 Cvals(k) = SC_ZERO;
785 }
786
787 // Entries of B
788 for (size_t j = Browptr(i); j < Browptr(i + 1); j++) {
789 const SC Bval = Bvals(j);
790 if (Bval == SC_ZERO)
791 continue;
792 LO Bij = Bcolind(j);
793 LO Cij = Bcol2Ccol(Bij);
794
795 // Assume no repeated entries in B
796 Cvals(c_status[Cij]) += Bvals(j);
797 CSR_ip++;
798 }
799
800 for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
801 LO Aik = Acolind(k);
802 const SC Aval = Avals(k);
803 if (Aval == SC_ZERO)
804 continue;
805
806 if (targetMapToOrigRow(Aik) != LO_INVALID) {
807 // Local matrix
808 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
809
810 for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
811 LO Bkj = Bcolind(j);
812 LO Cij = Bcol2Ccol(Bkj);
813
814 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
815 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
816 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
817
818 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
819 }
820 } else {
821 // Remote matrix
822 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
823 for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
824 LO Ikj = Icolind(j);
825 LO Cij = Icol2Ccol(Ikj);
826
827 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
828 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
829 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
830
831 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
832 }
833 }
834 }
835 }
836 });
837
838 // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
839}
840
841/*********************************************************************************************************/
/// \brief Assemble per-thread CSR fragments into the final CRS arrays of C.
///
/// The OpenMP LTG ("Low Thread-count Gustavson") kernels let each thread
/// assemble its contiguous chunk of rows into thread-local, malloc-backed
/// colind/values buffers (passed here as arrays of unmanaged views), while
/// writing thread-LOCAL offsets into row_mapC in place.  This routine:
///   1. prefix-sums the per-thread nonzero counts to get each thread's
///      starting offset in the global arrays,
///   2. allocates entriesC / valuesC at the final total size,
///   3. shifts each thread's rowptr entries to global offsets, and
///   4. bulk-copies each thread's colind/values into place, then frees the
///      thread-local buffers.
///
/// \param thread_total_nnz [in] View with one entry per thread: the number of
///        nonzeros that thread produced.
/// \param Incolind  [in] Per-thread unmanaged colind views (malloc-backed; freed here).
/// \param Invalues  [in] Per-thread unmanaged values views (malloc-backed; freed here).
/// \param m         [in] Number of local rows of C.
/// \param thread_chunk [in] Rows-per-thread used to partition [0,m); must match
///        the partitioning used when the fragments were built.
/// \param row_mapC  [in/out] Rowptr of C; thread-local offsets on input,
///        global offsets on output (the final entry, row_mapC(m), is set here).
/// \param entriesC  [out] Allocated and filled with the concatenated colind.
/// \param valuesC   [out] Allocated and filled with the concatenated values.
template <class InColindArrayType, class InValsArrayType,
          class OutRowptrType, class OutColindType, class OutValsType>
void copy_out_from_thread_memory(const OutColindType& thread_total_nnz,
                                 const InColindArrayType& Incolind,
                                 const InValsArrayType& Invalues,
                                 const size_t m,
                                 const double thread_chunk,
                                 OutRowptrType& row_mapC,
                                 OutColindType& entriesC,
                                 OutValsType& valuesC) {
  typedef OutRowptrType lno_view_t;
  typedef OutColindType lno_nnz_view_t;
  typedef OutValsType scalar_view_t;
  typedef typename lno_view_t::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;

  // Generate the starting nnz number per thread
  size_t thread_max = Incolind.size();
  size_t c_nnz_size = 0;
  // Since this kernel is very likely 'low thread count', we could sum over
  // thread_total_nnz serially; but since it is a view we don't know where the
  // memory actually lives... so we kinda need to go parallel.

  lno_view_t thread_start_nnz("thread_nnz", thread_max + 1);
  // Exclusive prefix sum: thread_start_nnz(t) = total nnz of all threads < t,
  // with the grand total stored in the final (thread_max) entry.
  Kokkos::parallel_scan("LTG::Scan", range_type(0, thread_max).set_chunk_size(1), [=](const size_t i, size_t& update, const bool final) {
    size_t mynnz = thread_total_nnz(i);
    if (final) thread_start_nnz(i) = update;
    update += mynnz;
    if (final && i + 1 == thread_max) thread_start_nnz(i + 1) = update;
  });
  c_nnz_size = thread_start_nnz(thread_max);

  // 2019 Apr 10 JJE: update the rowptr's final entry here
  row_mapC(m) = thread_start_nnz(thread_max);

  // Allocate output (uninitialized; every entry is overwritten below)
  lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
  entriesC = entriesC_;
  scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
  valuesC = valuesC_;

  // Copy out
  Kokkos::parallel_for("LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
    const size_t my_thread_start = tid * thread_chunk;
    const size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
    const size_t nnz_thread_start = thread_start_nnz(tid);
    // we need this info, since we did the rowptr in place
    const size_t nnz_thread_stop = thread_start_nnz(tid + 1);

    // There are two fundamental operations:
    // * Updating the rowptr with the correct offset
    // * copying entries and values

    // First, update the rowptr; since we built it in place it holds
    // thread-local offsets, so this is a += operation.
    // In the paper, I experimented with a coloring scheme that had threads
    // do their copies in different orders. It wasn't obvious it was beneficial
    // but it can be replicated, by choosing the array to copy first based on your
    // thread id % 3
    if (my_thread_start != 0) {
      for (size_t i = my_thread_start; i < my_thread_stop; i++) {
        row_mapC(i) += nnz_thread_start;
      }
    }

    // try to Kokkos::single() the alloc here. It should implicitly barrier
    // thread 0 doesn't copy the rowptr, so it could hit the single first
    // in the paper, I played a game that effectively got LTG down to a single
    // OpenMP barrier. But doing that requires the ability to spawn a single
    // parallel region. The Scan above was implemented using atomic adds
    // and the barrier was only needed so you could allocate
    //
    // Since we can't spawn a single region, we could move the allocations
    // here, and using 'single'. Most likely, thread 0 will hit it first allowing
    // the other threads to update the rowptr while it allocates.

    // next, bulk copy the vals/colind arrays
    const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
    for (size_t i = 0; i < my_num_nnz; ++i) {
      entriesC(nnz_thread_start + i) = Incolind(tid)(i);
      valuesC(nnz_thread_start + i) = Invalues(tid)(i);
    }

    // Free the unmanaged views; let each thread deallocate its own memory.
    // May need to cast away const here..
    if (Incolind(tid).data()) free(Incolind(tid).data());
    if (Invalues(tid).data()) free(Invalues(tid).data());
  });
} // end copy_out
930
931#endif // OpenMP
932
933/*********************************************************************************************************/
934template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
935void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
936 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
937 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
938 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
939 const LocalOrdinalViewType& Acol2Brow,
940 const LocalOrdinalViewType& Acol2Irow,
941 const LocalOrdinalViewType& Bcol2Ccol,
942 const LocalOrdinalViewType& Icol2Ccol,
943 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
944 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Cimport,
945 const std::string& label,
946 const Teuchos::RCP<Teuchos::ParameterList>& params) {
947#ifdef HAVE_TPETRA_MMM_TIMINGS
948 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
949 using Teuchos::TimeMonitor;
950 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK"))));
951 Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Multiply"))));
952 using Teuchos::rcp;
953#endif
954 typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
955
956 // This kernel computes (I-omega Dinv A) B the slow way (for generality's sake, you must understand)
957
958 // 1) Multiply A*B
959 Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(new Matrix_t(C.getRowMap(), 0));
960 Tpetra::MMdetails::mult_A_B_newmatrix(Aview, Bview, *AB, label + std::string(" MSAK"), params);
961
962#ifdef HAVE_TPETRA_MMM_TIMINGS
963 MM2 = Teuchos::null;
964 MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Scale"))));
965#endif
966
967 // 2) Scale A by Dinv
968 AB->leftScale(Dinv);
969
970#ifdef HAVE_TPETRA_MMM_TIMINGS
971 MM2 = Teuchos::null;
972 MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Add"))));
973#endif
974
975 // 3) Add [-omega Dinv A] + B
976 Teuchos::ParameterList jparams;
977 if (!params.is_null()) {
978 jparams = *params;
979 jparams.set("label", label + std::string(" MSAK Add"));
980 }
981 Scalar one = Teuchos::ScalarTraits<Scalar>::one();
982 Tpetra::MatrixMatrix::add(one, false, *Bview.origMatrix, Scalar(-omega), false, *AB, C, AB->getDomainMap(), AB->getRangeMap(), Teuchos::rcp(&jparams, false));
983#ifdef HAVE_TPETRA_MMM_TIMINGS
984 MM2 = Teuchos::null;
985#endif
986} // jacobi_A_B_newmatrix_MultiplyScaleAddKernel
987
988#if defined(HAVE_TPETRA_INST_OPENMP)
989/*********************************************************************************************************/
990template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
991static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
992 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
993 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
994 const LocalOrdinalViewType& Acol2PRow,
995 const LocalOrdinalViewType& Acol2PRowImport,
996 const LocalOrdinalViewType& Pcol2Accol,
997 const LocalOrdinalViewType& PIcol2Accol,
998 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
999 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1000 const std::string& label,
1001 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1002 using Teuchos::RCP;
1003 using Tpetra::MatrixMatrix::UnmanagedView;
1004#ifdef HAVE_TPETRA_MMM_TIMINGS
1005 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1006 using Teuchos::rcp;
1007 using Teuchos::TimeMonitor;
1008 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix LTGCore"))));
1009#endif
1010
1011 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1012 typedef Scalar SC;
1013 typedef LocalOrdinal LO;
1014 typedef GlobalOrdinal GO;
1015 typedef Node NO;
1016 typedef Map<LO, GO, NO> map_type;
1018 typedef typename KCRS::StaticCrsGraphType graph_t;
1019 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1020 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1021 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1022 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1023 typedef typename KCRS::device_type device_t;
1024 typedef typename device_t::execution_space execution_space;
1025 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1026
1027 // Unmanaged versions of the above
1028 typedef UnmanagedView<lno_view_t> u_lno_view_t;
1029 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1030 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1031
1032 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1033 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1034
1035 // Sizes
1036 RCP<const map_type> Accolmap = Ac.getColMap();
1037 size_t m = Rview.origMatrix->getLocalNumRows();
1038 size_t n = Accolmap->getLocalNumElements();
1039
1040 // Get raw Kokkos matrices, and the raw CSR views
1041 const KCRS Rmat = Rview.origMatrix->getLocalMatrixDevice();
1042 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
1043 const KCRS Pmat = Pview.origMatrix->getLocalMatrixDevice();
1044
1045 c_lno_view_t Rrowptr = Rmat.graph.row_map,
1046 Arowptr = Amat.graph.row_map,
1047 Prowptr = Pmat.graph.row_map, Irowptr;
1048 const lno_nnz_view_t Rcolind = Rmat.graph.entries,
1049 Acolind = Amat.graph.entries,
1050 Pcolind = Pmat.graph.entries;
1051 lno_nnz_view_t Icolind;
1052 const scalar_view_t Rvals = Rmat.values,
1053 Avals = Amat.values,
1054 Pvals = Pmat.values;
1055 scalar_view_t Ivals;
1056
1057 if (!Pview.importMatrix.is_null()) {
1058 const KCRS Imat = Pview.importMatrix->getLocalMatrixDevice();
1059 Irowptr = Imat.graph.row_map;
1060 Icolind = Imat.graph.entries;
1061 Ivals = Imat.values;
1062 }
1063
1064 // Classic csr assembly (low memory edition)
1065 //
1066 // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
1067 // The method loops over rows of R, and may resize after processing
1068 // each row. Chris Siefert says that this reflects experience in
1069 // ML; for the non-threaded case, ML found it faster to spend less
1070 // effort on estimation and risk an occasional reallocation.
1071
1072 size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1073 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1074
1075 // Get my node / thread info (right from openmp or parameter list)
1076 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1077 if (!params.is_null()) {
1078 if (params->isParameter("openmp: ltg thread max"))
1079 thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
1080 }
1081
1082 double thread_chunk = (double)(m) / thread_max;
1083
1084 // we can construct the rowptr inplace, allocate here and fault in parallel
1085 lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
1086 // we do not touch these until copy out
1087 lno_nnz_view_t entriesAc;
1088 scalar_view_t valuesAc;
1089
1090 // add this, since we do the rowptr in place, we could figure this out
1091 // using the rowptr, but it requires an unusual loop (that jumps all over memory)
1092 lno_nnz_view_t thread_total_nnz("thread_total_nnz", thread_max + 1);
1093
1094 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1095 // routine. The routine computes Ac := R * A * (P_local + P_remote).
1096 //
1097 // For column index Aik in row i of A, Acol2PRow[Aik] tells
1098 // you whether the corresponding row of P belongs to P_local
1099 // ("orig") or P_remote ("Import").
1100
1101 // Thread-local memory
1102 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
1103 Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);
1104
1105 // For each row of R
1106 Kokkos::parallel_for("MMM::RAP::NewMatrix::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
1107 // Thread coordination stuff
1108 size_t my_thread_start = tid * thread_chunk;
1109 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
1110 size_t my_thread_m = my_thread_stop - my_thread_start;
1111
1112 size_t nnzAllocated = (size_t)(my_thread_m * Acest_nnz_per_row + 100);
1113
1114 std::vector<size_t> ac_status(n, INVALID);
1115
1116 // manually allocate the thread-local storage for Ac
1117 u_lno_view_t Acrowptr((typename u_lno_view_t::data_type)malloc(u_lno_view_t::shmem_size(my_thread_m + 1)), my_thread_m + 1);
1118 u_lno_nnz_view_t Accolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1119 u_scalar_view_t Acvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1120
1121 // count entries as they are added to Ac
1122 size_t nnz = 0, nnz_old = 0;
1123 // bmk: loop over the rows of R which are assigned to thread tid
1124 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1125 // directly index into the rowptr
1126 rowmapAc(i) = nnz;
1127 // mfh 27 Sep 2016: For each entry of R in the current row of R
1128 for (size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
1129 LO k = Rcolind(kk); // local column index of current entry of R
1130 const SC Rik = Rvals(kk); // value of current entry of R
1131 if (Rik == SC_ZERO)
1132 continue; // skip explicitly stored zero values in R
1133 // For each entry of A in the current row of A
1134 for (size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
1135 LO l = Acolind(ll); // local column index of current entry of A
1136 const SC Akl = Avals(ll); // value of current entry of A
1137 if (Akl == SC_ZERO)
1138 continue; // skip explicitly stored zero values in A
1139
1140 if (Acol2PRow[l] != LO_INVALID) {
1141 // mfh 27 Sep 2016: If the entry of Acol2PRow
1142 // corresponding to the current entry of A is populated, then
1143 // the corresponding row of P is in P_local (i.e., it lives on
1144 // the calling process).
1145
1146 // Local matrix
1147 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1148
1149 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1150 for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
1151 LO j = Pcolind(jj);
1152 LO Acj = Pcol2Accol(j);
1153 SC Plj = Pvals(jj);
1154
1155 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1156#ifdef HAVE_TPETRA_DEBUG
1157 // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1158 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1159 std::runtime_error,
1160 label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1161#endif
1162 // New entry
1163 ac_status[Acj] = nnz;
1164 Accolind(nnz) = Acj;
1165 Acvals(nnz) = Rik * Akl * Plj;
1166 nnz++;
1167 } else {
1168 Acvals(ac_status[Acj]) += Rik * Akl * Plj;
1169 }
1170 }
1171 } else {
1172 // mfh 27 Sep 2016: If the entry of Acol2PRow
1173 // corresponding to the current entry of A is NOT populated (has
1174 // a flag "invalid" value), then the corresponding row of P is
1175 // in P_remote (i.e., it does not live on the calling process).
1176
1177 // Remote matrix
1178 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1179 for (size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
1180 LO j = Icolind(jj);
1181 LO Acj = PIcol2Accol(j);
1182 SC Plj = Ivals(jj);
1183
1184 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1185#ifdef HAVE_TPETRA_DEBUG
1186 // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1187 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1188 std::runtime_error,
1189 label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1190#endif
1191 // New entry
1192 ac_status[Acj] = nnz;
1193 Accolind(nnz) = Acj;
1194 Acvals(nnz) = Rik * Akl * Plj;
1195 nnz++;
1196 } else {
1197 Acvals(ac_status[Acj]) += Rik * Akl * Plj;
1198 }
1199 }
1200 }
1201 }
1202 }
1203 // Resize for next pass if needed
1204 if (nnz + n > nnzAllocated) {
1205 nnzAllocated *= 2;
1206 Accolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1207 Acvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*)Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1208 }
1209 nnz_old = nnz;
1210 }
1211 thread_total_nnz(tid) = nnz;
1212 tl_colind(tid) = Accolind;
1213 tl_values(tid) = Acvals;
1214 });
1215#ifdef HAVE_TPETRA_MMM_TIMINGS
1216 MM = Teuchos::null;
1217 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix copy from thread local"))));
1218#endif
1219
1220 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1221
1222#ifdef HAVE_TPETRA_MMM_TIMINGS
1223 MM = Teuchos::null;
1224 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1225#endif
1226
1227 // Final sort & set of CRS arrays
1228 // Tpetra's SpGEMM results in almost sorted matrices. Use shell sort.
1229 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc, ::KokkosSparse::SortAlgorithm::SHELL);
1230 // mfh 27 Sep 2016: This just sets pointers.
1231 Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1232
1233#ifdef HAVE_TPETRA_MMM_TIMINGS
1234 MM = Teuchos::null;
1235 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1236#endif
1237
1238 // Final FillComplete
1239 //
1240 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1241 // Import (from domain Map to column Map) construction (which costs
1242 // lots of communication) by taking the previously constructed
1243 // Import object. We should be able to do this without interfering
1244 // with the implementation of the local part of sparse matrix-matrix
1245 // multiply above.
1246 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1247 labelList->set("Timer Label", label);
1248 if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
1249 RCP<const Export<LO, GO, NO> > dummyExport;
1250 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1251 Rview.origMatrix->getRangeMap(),
1252 Acimport,
1253 dummyExport,
1254 labelList);
1255}
1256
1257/*********************************************************************************************************/
1258template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1259static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1260 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1261 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1262 const LocalOrdinalViewType& Acol2PRow,
1263 const LocalOrdinalViewType& Acol2PRowImport,
1264 const LocalOrdinalViewType& Pcol2Accol,
1265 const LocalOrdinalViewType& PIcol2Accol,
1266 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1267 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1268 const std::string& label,
1269 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1270 using Teuchos::RCP;
1271 using Tpetra::MatrixMatrix::UnmanagedView;
1272#ifdef HAVE_TPETRA_MMM_TIMINGS
1273 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1274 using Teuchos::rcp;
1275 using Teuchos::TimeMonitor;
1276 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse LTGCore"))));
1277#endif
1278
1279 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1280 typedef Scalar SC;
1281 typedef LocalOrdinal LO;
1282 typedef GlobalOrdinal GO;
1283 typedef Node NO;
1284 typedef Map<LO, GO, NO> map_type;
1286 typedef typename KCRS::StaticCrsGraphType graph_t;
1287 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1288 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1289 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1290 typedef typename KCRS::device_type device_t;
1291 typedef typename device_t::execution_space execution_space;
1292 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1293
1294 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1295 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1296 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1297
1298 // Sizes
1299 RCP<const map_type> Accolmap = Ac.getColMap();
1300 size_t m = Rview.origMatrix->getLocalNumRows();
1301 size_t n = Accolmap->getLocalNumElements();
1302
1303 // Get raw Kokkos matrices, and the raw CSR views
1304 const KCRS Rmat = Rview.origMatrix->getLocalMatrixDevice();
1305 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
1306 const KCRS Pmat = Pview.origMatrix->getLocalMatrixDevice();
1307 const KCRS Cmat = Ac.getLocalMatrixDevice();
1308
1309 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1310 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1311 lno_nnz_view_t Icolind;
1312 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1313 scalar_view_t Cvals = Cmat.values;
1314 scalar_view_t Ivals;
1315
1316 if (!Pview.importMatrix.is_null()) {
1317 const KCRS Imat = Pview.importMatrix->getLocalMatrixDevice();
1318 Irowptr = Imat.graph.row_map;
1319 Icolind = Imat.graph.entries;
1320 Ivals = Imat.values;
1321 }
1322
1323 // Get my node / thread info (right from openmp or parameter list)
1324 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1325 if (!params.is_null()) {
1326 if (params->isParameter("openmp: ltg thread max"))
1327 thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
1328 }
1329
1330 double thread_chunk = (double)(m) / thread_max;
1331
1332 // For each row of R
1333 Kokkos::parallel_for("MMM::RAP::Reuse::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
1334 // Thread coordination stuff
1335 size_t my_thread_start = tid * thread_chunk;
1336 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
1337
1338 std::vector<size_t> c_status(n, INVALID);
1339
1340 // count entries as they are added to Ac
1341 size_t OLD_ip = 0, CSR_ip = 0;
1342 // bmk: loop over the rows of R which are assigned to thread tid
1343 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1344 // First fill the c_status array w/ locations where we're allowed to
1345 // generate nonzeros for this row
1346 OLD_ip = Crowptr(i);
1347 CSR_ip = Crowptr(i + 1);
1348 for (size_t k = OLD_ip; k < CSR_ip; k++) {
1349 c_status[Ccolind(k)] = k;
1350 // Reset values in the row of C
1351 Cvals(k) = SC_ZERO;
1352 }
1353
1354 // mfh 27 Sep 2016: For each entry of R in the current row of R
1355 for (size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
1356 LO k = Rcolind(kk); // local column index of current entry of R
1357 const SC Rik = Rvals(kk); // value of current entry of R
1358 if (Rik == SC_ZERO)
1359 continue; // skip explicitly stored zero values in R
1360 // For each entry of A in the current row of A
1361 for (size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
1362 LO l = Acolind(ll); // local column index of current entry of A
1363 const SC Akl = Avals(ll); // value of current entry of A
1364 if (Akl == SC_ZERO)
1365 continue; // skip explicitly stored zero values in A
1366
1367 if (Acol2PRow[l] != LO_INVALID) {
1368 // mfh 27 Sep 2016: If the entry of Acol2PRow
1369 // corresponding to the current entry of A is populated, then
1370 // the corresponding row of P is in P_local (i.e., it lives on
1371 // the calling process).
1372
1373 // Local matrix
1374 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1375
1376 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1377 for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
1378 LO j = Pcolind(jj);
1379 LO Cij = Pcol2Accol(j);
1380 SC Plj = Pvals(jj);
1381 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1382 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
1383 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1384
1385 Cvals(c_status[Cij]) += Rik * Akl * Plj;
1386 }
1387 } else {
1388 // mfh 27 Sep 2016: If the entry of Acol2PRow
1389 // corresponding to the current entry of A is NOT populated (has
1390 // a flag "invalid" value), then the corresponding row of P is
1391 // in P_remote (i.e., it does not live on the calling process).
1392
1393 // Remote matrix
1394 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1395 for (size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
1396 LO j = Icolind(jj);
1397 LO Cij = PIcol2Accol(j);
1398 SC Plj = Ivals(jj);
1399 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1400 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
1401 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1402
1403 Cvals(c_status[Cij]) += Rik * Akl * Plj;
1404 }
1405 }
1406 }
1407 }
1408 }
1409 });
1410 // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
1411}
1412
1413#endif
1414
1415} // namespace ExtraKernels
1416} // namespace MatrixMatrix
1417} // namespace Tpetra
1418
1419#endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
Namespace Tpetra contains the class and methods constituting the Tpetra library.